diff mbox series

[ovs-dev,2/4] pinctrl: Send periodic arp/nd to ecmp next-hops.

Message ID daf0621441d3d1606aae1ab21d1cd403ad6d264e.1731495611.git.lorenzo.bianconi@redhat.com
State Changes Requested
Delegated to: Dumitru Ceara
Headers show
Series Introduce ECMP_nexthop monitor in ovn-controller | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Lorenzo Bianconi Nov. 13, 2024, 11:04 a.m. UTC
Introduce the capbility to periodically send ARP/ND packets for
ECMP nexthops in order to resolve their L2 address. This is a preliminary
patch to introduce the capability to flush stale ECMP CT entries.

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 NEWS                            |   5 +
 controller/ovn-controller.8.xml |  10 ++
 controller/ovn-controller.c     |   2 +
 controller/pinctrl.c            | 284 +++++++++++++++++++++++++++++++-
 controller/pinctrl.h            |   2 +
 5 files changed, 300 insertions(+), 3 deletions(-)

Comments

Dumitru Ceara Dec. 18, 2024, 12:11 p.m. UTC | #1
On 11/13/24 12:04 PM, Lorenzo Bianconi wrote:
> Introduce the capbility to periodically send ARP/ND packets for
> ECMP nexthops in order to resolve their L2 address. This is a preliminary
> patch to introduce the capability to flush stale ECMP CT entries.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> ---

Hi Lorenzo,

Thanks for the patch!

>  NEWS                            |   5 +
>  controller/ovn-controller.8.xml |  10 ++
>  controller/ovn-controller.c     |   2 +
>  controller/pinctrl.c            | 284 +++++++++++++++++++++++++++++++-
>  controller/pinctrl.h            |   2 +
>  5 files changed, 300 insertions(+), 3 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index da3aba739..1f8f54d5d 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -4,6 +4,11 @@ Post v24.09.0
>       hash (with specified hash fields) for ECMP routes
>       while choosing nexthop.
>     - ovn-ic: Add support for route tag to prevent route learning.
> +   - Add "arp-max-timeout-sec" config option to vswitchd external-ids to
> +     cap the time between when ovn-controller sends ARP/ND packets for
> +     ECMP-nexthop.

This is not used anywhere.  The system tests in the last patch set it
but ovn-controller never reads it.

Maybe it's because I'm not a native speaker but "to cap the time between
when ovn-controller sends ARP/ND packets for ECMP-nexthop" doesn't make
sense to me.  What if we change this to:

- Add "arp-max-timeout-sec" config option to vswitchd external-ids to
  configure the interval (in seconds) between ovn-controller originated
  ARP/ND packets used for tracking ECMP next hop MAC addresses.


> +     By default ovn-controller continuously sends ARP/ND packets for
> +     ECMP-nexthop.

Why would we ever want some other behavior?  If I understand correctly
the idea is that we need to continuously monitor next hops in case they
migrated to a new mac.  I think we need to change this (unused?) option
to allow us to just configure the interval between ARP requests with a
reasonable default value.

>  
>  OVN v24.09.0 - 13 Sep 2024
>  --------------------------
> diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
> index aeaa374c1..7f95a9932 100644
> --- a/controller/ovn-controller.8.xml
> +++ b/controller/ovn-controller.8.xml
> @@ -385,6 +385,16 @@
>          cap for the exponential backoff used by <code>ovn-controller</code>
>          to send GARPs packets.
>        </dd>
> +      <dt><code>external_ids:arp-nd-max-timeout-sec</code></dt>
> +      <dd>
> +        When used, this configuration value specifies the maximum timeout
> +        (in seconds) between two consecutive ARP/ND packets sent by
> +        <code>ovn-controller</code> to resolve ECMP nexthop mac address.
> +        <code>ovn-controller</code> by default continuously sends ARP/ND
> +        packets. Setting <code>external_ids:arp-nd-max-timeout-sec</code>
> +        allows to cap for the exponential backoff used by <code>ovn-controller
> +        </code> to send ARPs/NDs packets.
> +      </dd>

Oh, now I see.  The NEWS item is wrong, the option is actually called
"arp-nd-max-timeout-sec".  That also means the tests in the last patch
configure the wrong external_id.

>        <dt><code>external_ids:ovn-bridge-remote</code></dt>
>        <dd>
>          <p>
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 98b144699..ecfa3b229 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c

We also need to implement conditional monitoring for the SB.ECMP_Nexthop
table.  That is, we need to add code for it in update_sb_monitors() to
only monitor records whose datapaths are local.

> @@ -5743,6 +5743,8 @@ main(int argc, char *argv[])
>                                      sbrec_mac_binding_table_get(
>                                          ovnsb_idl_loop.idl),
>                                      sbrec_bfd_table_get(ovnsb_idl_loop.idl),
> +                                    sbrec_ecmp_nexthop_table_get(
> +                                        ovnsb_idl_loop.idl),
>                                      br_int, chassis,
>                                      &runtime_data->local_datapaths,
>                                      &runtime_data->active_tunnels,
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index 3fb7e2fd7..eb6043ef8 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -164,6 +164,9 @@ static struct seq *pinctrl_main_seq;
>  static long long int garp_rarp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
>  static bool garp_rarp_continuous;
>  
> +static long long int arp_nd_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> +static bool arp_nd_continuous;
> +
>  static void *pinctrl_handler(void *arg);
>  
>  struct pinctrl {
> @@ -223,13 +226,17 @@ static void run_activated_ports(
>      const struct sbrec_chassis *chassis);
>  
>  static void init_send_garps_rarps(void);
> +static void init_send_arps_nds(void);
>  static void destroy_send_garps_rarps(void);
> +static void destroy_send_arps_nds(void);
>  static void send_garp_rarp_wait(long long int send_garp_rarp_time);
> +static void send_arp_nd_wait(long long int send_arp_nd_time);
>  static void send_garp_rarp_prepare(
>      struct ovsdb_idl_txn *ovnsb_idl_txn,
>      struct ovsdb_idl_index *sbrec_port_binding_by_datapath,
>      struct ovsdb_idl_index *sbrec_port_binding_by_name,
>      struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip,
> +    const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
>      const struct ovsrec_bridge *,
>      const struct sbrec_chassis *,
>      const struct hmap *local_datapaths,
> @@ -239,6 +246,9 @@ static void send_garp_rarp_prepare(
>  static void send_garp_rarp_run(struct rconn *swconn,
>                                 long long int *send_garp_rarp_time)
>      OVS_REQUIRES(pinctrl_mutex);
> +static void send_arp_nd_run(struct rconn *swconn,
> +                            long long int *send_arp_nd_time)
> +    OVS_REQUIRES(pinctrl_mutex);
>  static void pinctrl_handle_nd_na(struct rconn *swconn,
>                                   const struct flow *ip_flow,
>                                   const struct match *md,
> @@ -548,6 +558,7 @@ pinctrl_init(void)
>  {
>      init_put_mac_bindings();
>      init_send_garps_rarps();
> +    init_send_arps_nds();
>      init_ipv6_ras();
>      init_ipv6_prefixd();
>      init_buffered_packets_ctx();
> @@ -3878,6 +3889,7 @@ pinctrl_handler(void *arg_)
>      static long long int send_ipv6_ra_time = LLONG_MAX;
>      /* Next GARP/RARP announcement in ms. */
>      static long long int send_garp_rarp_time = LLONG_MAX;
> +    static long long int send_arp_nd_time = LLONG_MAX;
>      /* Next multicast query (IGMP) in ms. */
>      static long long int send_mcast_query_time = LLONG_MAX;
>      static long long int svc_monitors_next_run_time = LLONG_MAX;
> @@ -3915,6 +3927,7 @@ pinctrl_handler(void *arg_)
>              if (may_inject_pkts()) {
>                  ovs_mutex_lock(&pinctrl_mutex);
>                  send_garp_rarp_run(swconn, &send_garp_rarp_time);
> +                send_arp_nd_run(swconn, &send_arp_nd_time);
>                  send_ipv6_ras(swconn, &send_ipv6_ra_time);
>                  send_ipv6_prefixd(swconn, &send_prefixd_time);
>                  send_mac_binding_buffered_pkts(swconn);
> @@ -3933,6 +3946,7 @@ pinctrl_handler(void *arg_)
>          rconn_recv_wait(swconn);
>          if (rconn_is_connected(swconn)) {
>              send_garp_rarp_wait(send_garp_rarp_time);
> +            send_arp_nd_wait(send_arp_nd_time);
>              ipv6_ra_wait(send_ipv6_ra_time);
>              ip_mcast_querier_wait(send_mcast_query_time);
>              svc_monitors_wait(svc_monitors_next_run_time);
> @@ -4019,6 +4033,7 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>              const struct sbrec_service_monitor_table *svc_mon_table,
>              const struct sbrec_mac_binding_table *mac_binding_table,
>              const struct sbrec_bfd_table *bfd_table,
> +            const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
>              const struct ovsrec_bridge *br_int,
>              const struct sbrec_chassis *chassis,
>              const struct hmap *local_datapaths,
> @@ -4035,8 +4050,9 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>                             sbrec_port_binding_by_key, chassis);
>      send_garp_rarp_prepare(ovnsb_idl_txn, sbrec_port_binding_by_datapath,
>                             sbrec_port_binding_by_name,
> -                           sbrec_mac_binding_by_lport_ip, br_int, chassis,
> -                           local_datapaths, active_tunnels, ovs_table);
> +                           sbrec_mac_binding_by_lport_ip, ecmp_nh_table,
> +                           br_int, chassis, local_datapaths, active_tunnels,
> +                           ovs_table);
>      prepare_ipv6_ras(local_active_ports_ras, sbrec_port_binding_by_name);
>      prepare_ipv6_prefixd(ovnsb_idl_txn, sbrec_port_binding_by_name,
>                           local_active_ports_ipv6_pd, chassis,
> @@ -4570,6 +4586,7 @@ pinctrl_destroy(void)
>      latch_destroy(&pinctrl.pinctrl_thread_exit);
>      rconn_destroy(pinctrl.swconn);
>      destroy_send_garps_rarps();
> +    destroy_send_arps_nds();
>      destroy_ipv6_ras();
>      destroy_ipv6_prefixd();
>      destroy_buffered_packets_ctx();
> @@ -5077,6 +5094,150 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn,
>      }
>  }
>  
> +struct arp_nd_data {
> +    struct hmap_node hmap_node;
> +    struct eth_addr ea;          /* Ethernet address of port. */
> +    struct in6_addr src_ip;      /* IP address of port. */
> +    struct in6_addr dst_ip;      /* Destination IP address */
> +    long long int announce_time; /* Next announcement in ms. */
> +    int backoff;                 /* Backoff timeout for the next
> +                                  * announcement (in msecs). */
> +    uint32_t dp_key;             /* Datapath used to output this GARP. */
> +    uint32_t port_key;           /* Port to inject the GARP into. */
> +};
> +
> +static struct hmap send_arp_nd_data;
> +
> +static void
> +init_send_arps_nds(void)
> +{
> +    hmap_init(&send_arp_nd_data);
> +}
> +
> +static void
> +destroy_send_arps_nds(void)
> +{
> +    struct arp_nd_data *e;
> +    HMAP_FOR_EACH_POP (e, hmap_node, &send_arp_nd_data) {
> +        free(e);
> +    }
> +    hmap_destroy(&send_arp_nd_data);
> +}
> +
> +static struct arp_nd_data *
> +arp_nd_find_data(const struct sbrec_port_binding *pb,
> +                 const struct in6_addr *nexthop)
> +{
> +    uint32_t hash = 0;
> +
> +    hash = hash_add_in6_addr(hash, nexthop);
> +    hash = hash_add(hash, pb->datapath->tunnel_key);
> +    hash = hash_add(hash, pb->tunnel_key);
> +
> +    struct arp_nd_data *e;
> +    HMAP_FOR_EACH_WITH_HASH (e, hmap_node, hash, &send_arp_nd_data) {
> +        if (ipv6_addr_equals(&e->dst_ip, nexthop) &&
> +            e->port_key == pb->tunnel_key) {
> +            return e;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static bool
> +arp_nd_find_is_stale(const struct arp_nd_data *e,
> +                     const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
> +                     const struct sbrec_chassis *chassis)
> +{
> +    const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop;
> +    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) {
> +        const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port;
> +        if (pb->chassis != chassis) {
> +            continue;
> +        }
> +
> +        struct lport_addresses laddrs;
> +        if (!extract_ip_addresses(sb_ecmp_nexthop->nexthop, &laddrs)) {
> +            continue;
> +        }
> +
> +        struct in6_addr dst_ip = laddrs.n_ipv4_addrs
> +            ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr)
> +            : laddrs.ipv6_addrs[0].addr;
> +        destroy_lport_addresses(&laddrs);
> +
> +        if (pb->tunnel_key == e->port_key &&
> +            pb->datapath->tunnel_key == e->dp_key &&
> +            ipv6_addr_equals(&e->dst_ip, &dst_ip)) {
> +            return false;
> +        }
> +    }

This is inefficient.  We should instead iterate all SB next hops and do
a lookup in the arp_nd_data map.  All records from the arp_nd_data map
that weren't found by corresponding SB.ECMP_Nexthop records should be
considered stale.

> +    return true;
> +}
> +
> +static struct arp_nd_data *
> +arp_nd_alloc_data(const struct eth_addr ea,
> +                  struct in6_addr src_ip, struct in6_addr dst_ip,
> +                  const struct sbrec_port_binding *pb)
> +{
> +    struct arp_nd_data *e = xmalloc(sizeof *e);
> +    e->ea = ea;
> +    e->src_ip = src_ip;
> +    e->dst_ip = dst_ip;
> +    e->announce_time = time_msec() + 1000;
> +    e->backoff = 1000; /* msec. */
> +    e->dp_key = pb->datapath->tunnel_key;
> +    e->port_key = pb->tunnel_key;
> +
> +    uint32_t hash = 0;
> +    hash = hash_add_in6_addr(hash, &dst_ip);
> +    hash = hash_add(hash, e->dp_key);
> +    hash = hash_add(hash, e->port_key);
> +    hmap_insert(&send_arp_nd_data, &e->hmap_node, hash);
> +    notify_pinctrl_handler();
> +
> +    return e;
> +}
> +
> +/* Add or update a vif for which ARPs need to be announced. */
> +static void
> +send_arp_nd_update(const struct sbrec_port_binding *pb, const char *nexthop,
> +                   long long int max_arp_timeout, bool continuous_arp_nd)
> +{
> +    struct lport_addresses laddrs;
> +    if (!extract_ip_addresses(nexthop, &laddrs)) {
> +        return;
> +    }
> +
> +    struct in6_addr dst_ip = laddrs.n_ipv4_addrs
> +        ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr)
> +        : laddrs.ipv6_addrs[0].addr;
> +    destroy_lport_addresses(&laddrs);
> +
> +    struct arp_nd_data *e = arp_nd_find_data(pb, &dst_ip);
> +    if (!e) {
> +        if (!extract_lsp_addresses(pb->mac[0], &laddrs)) {
> +            return;
> +        }
> +
> +        if (laddrs.n_ipv4_addrs) {
> +            arp_nd_alloc_data(laddrs.ea,
> +                              in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr),
> +                              dst_ip, pb);
> +        } else if (laddrs.n_ipv6_addrs) {
> +            arp_nd_alloc_data(laddrs.ea, laddrs.ipv6_addrs[0].addr,
> +                              dst_ip, pb);
> +        }
> +        destroy_lport_addresses(&laddrs);

Do we need to notify pinctrl_handler in this case?

> +    } else if (max_arp_timeout != arp_nd_max_timeout ||
> +               continuous_arp_nd != arp_nd_continuous) {
> +        /* reset backoff */
> +        e->announce_time = time_msec() + 1000;
> +        e->backoff = 1000; /* msec. */

Same here?

> +    }
> +}
> +
>  /* Remove a vif from GARP announcements. */
>  static void
>  send_garp_rarp_delete(const char *lport)
> @@ -6415,6 +6576,16 @@ send_garp_rarp_wait(long long int send_garp_rarp_time)
>      }
>  }
>  
> +static void
> +send_arp_nd_wait(long long int send_arp_nd_time)
> +{
> +    /* Set the poll timer for next arp packet only if there is data to
> +     * be sent. */
> +    if (hmap_count(&send_arp_nd_data)) {
> +        poll_timer_wait_until(send_arp_nd_time);
> +    }
> +}
> +
>  /* Called with in the pinctrl_handler thread context. */
>  static void
>  send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time)
> @@ -6437,6 +6608,84 @@ send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time)
>      }
>  }
>  
> +static long long int
> +send_arp_nd(struct rconn *swconn, struct arp_nd_data *e,
> +            long long int current_time)

There's still a lot of code duplication with send_garp_rarp().  Let's
add a single send_self_originated_neigh_packet() to accept source and
destination IPs as in6_addr.

For example:

send_self_originated_neigh_packet(uint32_t dp_key, uint32_t port_key,
                                  struct eth_addr eth,
                                  struct in6_addr *local,
                                  struct in6_addr *target)
{
    ...
    if (!local) {
        compose_rarp(&packet, eth);
    } else if (IN6_IS_ADDR_V4MAPPED(local)) {
        compose_arp(&packet, ARP_OP_REQUEST, eth,
                    eth_addr_zero, true,
                    in6_addr_get_mapped_ipv4(local),
                    in6_addr_get_mapped_ipv4(target));
    } else {
        compose_nd_ns(&packet, eth, source, target);
    }
    ...
}

We can call this from send_garp_rarp() and send_arp_nd() with
appropriate arguments.

> +    OVS_REQUIRES(pinctrl_mutex)
> +{
> +    if (current_time < e->announce_time) {
> +        return e->announce_time;
> +    }
> +
> +    /* Compose a ARP request packet. */
> +    uint64_t packet_stub[128 / 8];
> +    struct dp_packet packet;
> +    dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub);
> +    if (IN6_IS_ADDR_V4MAPPED(&e->src_ip)) {
> +        compose_arp(&packet, ARP_OP_REQUEST, e->ea, eth_addr_zero,
> +                    true, in6_addr_get_mapped_ipv4(&e->src_ip),
> +                    in6_addr_get_mapped_ipv4(&e->dst_ip));
> +    } else {
> +        compose_nd_ns(&packet, e->ea, &e->src_ip, &e->dst_ip);
> +    }
> +
> +    /* Inject ARP request. */
> +    uint64_t ofpacts_stub[4096 / 8];
> +    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
> +    enum ofp_version version = rconn_get_version(swconn);
> +    put_load(e->dp_key, MFF_LOG_DATAPATH, 0, 64, &ofpacts);
> +    put_load(e->port_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
> +    struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts);
> +    resubmit->in_port = OFPP_CONTROLLER;
> +    resubmit->table_id = OFTABLE_LOCAL_OUTPUT;

As fas as I can tell, there's no real reason to not reinject from the
beginning of the pipeline (like we do with GARPs):

resubmit->table_id = OFTABLE_LOG_INGRESS_PIPELINE;

> +
> +    struct ofputil_packet_out po = {
> +        .packet = dp_packet_data(&packet),
> +        .packet_len = dp_packet_size(&packet),
> +        .buffer_id = UINT32_MAX,
> +        .ofpacts = ofpacts.data,
> +        .ofpacts_len = ofpacts.size,
> +    };
> +    match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER);
> +    enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
> +    queue_msg(swconn, ofputil_encode_packet_out(&po, proto));
> +    dp_packet_uninit(&packet);
> +    ofpbuf_uninit(&ofpacts);
> +
> +    /* Set the next announcement.  At most 5 announcements are sent for a
> +     * vif if arp_nd_max_timeout is not specified otherwise cap the max
> +     * timeout to arp_nd_max_timeout. */
> +    if (arp_nd_continuous || e->backoff < arp_nd_max_timeout) {
> +        e->announce_time = current_time + e->backoff;
> +    } else {
> +        e->announce_time = LLONG_MAX;
> +    }
> +    e->backoff = MIN(arp_nd_max_timeout, e->backoff * 2);
> +
> +    return e->announce_time;
> +}
> +
> +static void
> +send_arp_nd_run(struct rconn *swconn, long long int *send_arp_nd_time)
> +    OVS_REQUIRES(pinctrl_mutex)
> +{
> +    if (!hmap_count(&send_arp_nd_data)) {
> +        return;
> +    }
> +
> +    /* Send ARPs, and update the next announcement. */
> +    long long int current_time = time_msec();
> +    *send_arp_nd_time = LLONG_MAX;
> +
> +    struct arp_nd_data *e;
> +    HMAP_FOR_EACH (e, hmap_node, &send_arp_nd_data) {
> +        long long int next_announce = send_arp_nd(swconn, e, current_time);
> +        if (*send_arp_nd_time > next_announce) {
> +            *send_arp_nd_time = next_announce;
> +        }
> +    }
> +}
> +
>  /* Called by pinctrl_run(). Runs with in the main ovn-controller
>   * thread context. */
>  static void
> @@ -6444,6 +6693,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
>                         struct ovsdb_idl_index *sbrec_port_binding_by_datapath,
>                         struct ovsdb_idl_index *sbrec_port_binding_by_name,
>                         struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip,
> +                       const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
>                         const struct ovsrec_bridge *br_int,
>                         const struct sbrec_chassis *chassis,
>                         const struct hmap *local_datapaths,
> @@ -6456,7 +6706,8 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
>      struct sset nat_ip_keys = SSET_INITIALIZER(&nat_ip_keys);
>      struct shash nat_addresses;
>      unsigned long long garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> -    bool garp_continuous = false;
> +    unsigned long long max_arp_nd_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> +    bool garp_continuous = false, continuous_arp_nd = true;
>      const struct ovsrec_open_vswitch *cfg =
>          ovsrec_open_vswitch_table_first(ovs_table);
>      if (cfg) {
> @@ -6466,6 +6717,11 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
>          if (!garp_max_timeout) {
>              garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
>          }
> +
> +        max_arp_nd_timeout = smap_get_ullong(
> +                &cfg->external_ids, "arp-nd-max-timeout-sec",
> +                GARP_RARP_DEF_MAX_TIMEOUT / 1000) * 1000;
> +        continuous_arp_nd = !!max_arp_nd_timeout;
>      }
>  
>      shash_init(&nat_addresses);
> @@ -6479,6 +6735,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
>                                 &nat_ip_keys, &local_l3gw_ports,
>                                 chassis, active_tunnels,
>                                 &nat_addresses);
> +
>      /* For deleted ports and deleted nat ips, remove from
>       * send_garp_rarp_data. */
>      struct shash_node *iter;
> @@ -6514,6 +6771,24 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
>          }
>      }
>  
> +    struct arp_nd_data *e;
> +    const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop;
> +    HMAP_FOR_EACH_SAFE (e, hmap_node, &send_arp_nd_data) {
> +        if (arp_nd_find_is_stale(e, ecmp_nh_table, chassis)) {
> +            hmap_remove(&send_arp_nd_data, &e->hmap_node);
> +            free(e);
> +            notify_pinctrl_handler();

This takes the pinctrl_handler_seq lock every time and tries to wake up
any waiters (but those are probably blocked in the pinctrl_mutex).  We
should probably just notify once, outside the loop, if there's any stale
entry.

On the other hand, why do we even need to notify pinctrl_handler()?  We
removed entries, we don't need to generate new ARPs.  Or am I missing
something?


> +        }
> +    }
> +
> +    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) {
> +        const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port;
> +        if (pb && !strcmp(pb->type, "l3gateway") && pb->chassis == chassis) {
> +            send_arp_nd_update(pb, sb_ecmp_nexthop->nexthop,
> +                               max_arp_nd_timeout, continuous_arp_nd);
> +        }
> +    }
> +
>      /* pinctrl_handler thread will send the GARPs. */
>  
>      sset_destroy(&localnet_vifs);
> @@ -6531,6 +6806,9 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
>  
>      garp_rarp_max_timeout = garp_max_timeout;
>      garp_rarp_continuous = garp_continuous;
> +
> +    arp_nd_max_timeout = max_arp_nd_timeout;
> +    arp_nd_continuous = continuous_arp_nd;
>  }
>  
>  static bool
> diff --git a/controller/pinctrl.h b/controller/pinctrl.h
> index 846afe0a4..8459f4f53 100644
> --- a/controller/pinctrl.h
> +++ b/controller/pinctrl.h
> @@ -36,6 +36,7 @@ struct sbrec_dns_table;
>  struct sbrec_controller_event_table;
>  struct sbrec_service_monitor_table;
>  struct sbrec_bfd_table;
> +struct sbrec_ecmp_nexthop_table;
>  struct sbrec_port_binding;
>  struct sbrec_mac_binding_table;
>  
> @@ -53,6 +54,7 @@ void pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>                   const struct sbrec_service_monitor_table *,
>                   const struct sbrec_mac_binding_table *,
>                   const struct sbrec_bfd_table *,
> +                 const struct sbrec_ecmp_nexthop_table *,
>                   const struct ovsrec_bridge *, const struct sbrec_chassis *,
>                   const struct hmap *local_datapaths,
>                   const struct sset *active_tunnels,

Regards,
Dumitru
Lorenzo Bianconi Dec. 19, 2024, 10:50 p.m. UTC | #2
> On 11/13/24 12:04 PM, Lorenzo Bianconi wrote:
> > Introduce the capbility to periodically send ARP/ND packets for
> > ECMP nexthops in order to resolve their L2 address. This is a preliminary
> > patch to introduce the capability to flush stale ECMP CT entries.
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> > ---
> 
> Hi Lorenzo,
> 
> Thanks for the patch!

Hi Dumitru,

thx for the review.

> 
> >  NEWS                            |   5 +
> >  controller/ovn-controller.8.xml |  10 ++
> >  controller/ovn-controller.c     |   2 +
> >  controller/pinctrl.c            | 284 +++++++++++++++++++++++++++++++-
> >  controller/pinctrl.h            |   2 +
> >  5 files changed, 300 insertions(+), 3 deletions(-)
> > 
> > diff --git a/NEWS b/NEWS
> > index da3aba739..1f8f54d5d 100644
> > --- a/NEWS
> > +++ b/NEWS
> > @@ -4,6 +4,11 @@ Post v24.09.0
> >       hash (with specified hash fields) for ECMP routes
> >       while choosing nexthop.
> >     - ovn-ic: Add support for route tag to prevent route learning.
> > +   - Add "arp-max-timeout-sec" config option to vswitchd external-ids to
> > +     cap the time between when ovn-controller sends ARP/ND packets for
> > +     ECMP-nexthop.
> 
> This is not used anywhere.  The system tests in the last patch set it
> but ovn-controller never reads it.
> 
> Maybe it's because I'm not a native speaker but "to cap the time between
> when ovn-controller sends ARP/ND packets for ECMP-nexthop" doesn't make
> sense to me.  What if we change this to:
> 
> - Add "arp-max-timeout-sec" config option to vswitchd external-ids to
>   configure the interval (in seconds) between ovn-controller originated
>   ARP/ND packets used for tracking ECMP next hop MAC addresses.
> 
> 
> > +     By default ovn-controller continuously sends ARP/ND packets for
> > +     ECMP-nexthop.
> 
> Why would we ever want some other behavior?  If I understand correctly
> the idea is that we need to continuously monitor next hops in case they
> migrated to a new mac.  I think we need to change this (unused?) option
> to allow us to just configure the interval between ARP requests with a
> reasonable default value.
> 
> >  
> >  OVN v24.09.0 - 13 Sep 2024
> >  --------------------------
> > diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
> > index aeaa374c1..7f95a9932 100644
> > --- a/controller/ovn-controller.8.xml
> > +++ b/controller/ovn-controller.8.xml
> > @@ -385,6 +385,16 @@
> >          cap for the exponential backoff used by <code>ovn-controller</code>
> >          to send GARPs packets.
> >        </dd>
> > +      <dt><code>external_ids:arp-nd-max-timeout-sec</code></dt>
> > +      <dd>
> > +        When used, this configuration value specifies the maximum timeout
> > +        (in seconds) between two consecutive ARP/ND packets sent by
> > +        <code>ovn-controller</code> to resolve ECMP nexthop mac address.
> > +        <code>ovn-controller</code> by default continuously sends ARP/ND
> > +        packets. Setting <code>external_ids:arp-nd-max-timeout-sec</code>
> > +        allows to cap for the exponential backoff used by <code>ovn-controller
> > +        </code> to send ARPs/NDs packets.
> > +      </dd>
> 
> Oh, now I see.  The NEWS item is wrong, the option is actually called
> "arp-nd-max-timeout-sec".  That also means the tests in the last patch
> configure the wrong external_id.
> 
> >        <dt><code>external_ids:ovn-bridge-remote</code></dt>
> >        <dd>
> >          <p>
> > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> > index 98b144699..ecfa3b229 100644
> > --- a/controller/ovn-controller.c
> > +++ b/controller/ovn-controller.c
> 
> We also need to implement conditional monitoring for the SB.ECMP_Nexthop
> table.  That is, we need to add code for it in update_sb_monitors() to
> only monitor records whose datapaths are local.

ack, I will add it.

> 
> > @@ -5743,6 +5743,8 @@ main(int argc, char *argv[])
> >                                      sbrec_mac_binding_table_get(
> >                                          ovnsb_idl_loop.idl),
> >                                      sbrec_bfd_table_get(ovnsb_idl_loop.idl),
> > +                                    sbrec_ecmp_nexthop_table_get(
> > +                                        ovnsb_idl_loop.idl),
> >                                      br_int, chassis,
> >                                      &runtime_data->local_datapaths,
> >                                      &runtime_data->active_tunnels,
> > diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> > index 3fb7e2fd7..eb6043ef8 100644
> > --- a/controller/pinctrl.c
> > +++ b/controller/pinctrl.c
> > @@ -164,6 +164,9 @@ static struct seq *pinctrl_main_seq;
> >  static long long int garp_rarp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> >  static bool garp_rarp_continuous;
> >  
> > +static long long int arp_nd_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> > +static bool arp_nd_continuous;
> > +
> >  static void *pinctrl_handler(void *arg);
> >  
> >  struct pinctrl {
> > @@ -223,13 +226,17 @@ static void run_activated_ports(
> >      const struct sbrec_chassis *chassis);
> >  
> >  static void init_send_garps_rarps(void);
> > +static void init_send_arps_nds(void);
> >  static void destroy_send_garps_rarps(void);
> > +static void destroy_send_arps_nds(void);
> >  static void send_garp_rarp_wait(long long int send_garp_rarp_time);
> > +static void send_arp_nd_wait(long long int send_arp_nd_time);
> >  static void send_garp_rarp_prepare(
> >      struct ovsdb_idl_txn *ovnsb_idl_txn,
> >      struct ovsdb_idl_index *sbrec_port_binding_by_datapath,
> >      struct ovsdb_idl_index *sbrec_port_binding_by_name,
> >      struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip,
> > +    const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
> >      const struct ovsrec_bridge *,
> >      const struct sbrec_chassis *,
> >      const struct hmap *local_datapaths,
> > @@ -239,6 +246,9 @@ static void send_garp_rarp_prepare(
> >  static void send_garp_rarp_run(struct rconn *swconn,
> >                                 long long int *send_garp_rarp_time)
> >      OVS_REQUIRES(pinctrl_mutex);
> > +static void send_arp_nd_run(struct rconn *swconn,
> > +                            long long int *send_arp_nd_time)
> > +    OVS_REQUIRES(pinctrl_mutex);
> >  static void pinctrl_handle_nd_na(struct rconn *swconn,
> >                                   const struct flow *ip_flow,
> >                                   const struct match *md,
> > @@ -548,6 +558,7 @@ pinctrl_init(void)
> >  {
> >      init_put_mac_bindings();
> >      init_send_garps_rarps();
> > +    init_send_arps_nds();
> >      init_ipv6_ras();
> >      init_ipv6_prefixd();
> >      init_buffered_packets_ctx();
> > @@ -3878,6 +3889,7 @@ pinctrl_handler(void *arg_)
> >      static long long int send_ipv6_ra_time = LLONG_MAX;
> >      /* Next GARP/RARP announcement in ms. */
> >      static long long int send_garp_rarp_time = LLONG_MAX;
> > +    static long long int send_arp_nd_time = LLONG_MAX;
> >      /* Next multicast query (IGMP) in ms. */
> >      static long long int send_mcast_query_time = LLONG_MAX;
> >      static long long int svc_monitors_next_run_time = LLONG_MAX;
> > @@ -3915,6 +3927,7 @@ pinctrl_handler(void *arg_)
> >              if (may_inject_pkts()) {
> >                  ovs_mutex_lock(&pinctrl_mutex);
> >                  send_garp_rarp_run(swconn, &send_garp_rarp_time);
> > +                send_arp_nd_run(swconn, &send_arp_nd_time);
> >                  send_ipv6_ras(swconn, &send_ipv6_ra_time);
> >                  send_ipv6_prefixd(swconn, &send_prefixd_time);
> >                  send_mac_binding_buffered_pkts(swconn);
> > @@ -3933,6 +3946,7 @@ pinctrl_handler(void *arg_)
> >          rconn_recv_wait(swconn);
> >          if (rconn_is_connected(swconn)) {
> >              send_garp_rarp_wait(send_garp_rarp_time);
> > +            send_arp_nd_wait(send_arp_nd_time);
> >              ipv6_ra_wait(send_ipv6_ra_time);
> >              ip_mcast_querier_wait(send_mcast_query_time);
> >              svc_monitors_wait(svc_monitors_next_run_time);
> > @@ -4019,6 +4033,7 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >              const struct sbrec_service_monitor_table *svc_mon_table,
> >              const struct sbrec_mac_binding_table *mac_binding_table,
> >              const struct sbrec_bfd_table *bfd_table,
> > +            const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
> >              const struct ovsrec_bridge *br_int,
> >              const struct sbrec_chassis *chassis,
> >              const struct hmap *local_datapaths,
> > @@ -4035,8 +4050,9 @@ pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >                             sbrec_port_binding_by_key, chassis);
> >      send_garp_rarp_prepare(ovnsb_idl_txn, sbrec_port_binding_by_datapath,
> >                             sbrec_port_binding_by_name,
> > -                           sbrec_mac_binding_by_lport_ip, br_int, chassis,
> > -                           local_datapaths, active_tunnels, ovs_table);
> > +                           sbrec_mac_binding_by_lport_ip, ecmp_nh_table,
> > +                           br_int, chassis, local_datapaths, active_tunnels,
> > +                           ovs_table);
> >      prepare_ipv6_ras(local_active_ports_ras, sbrec_port_binding_by_name);
> >      prepare_ipv6_prefixd(ovnsb_idl_txn, sbrec_port_binding_by_name,
> >                           local_active_ports_ipv6_pd, chassis,
> > @@ -4570,6 +4586,7 @@ pinctrl_destroy(void)
> >      latch_destroy(&pinctrl.pinctrl_thread_exit);
> >      rconn_destroy(pinctrl.swconn);
> >      destroy_send_garps_rarps();
> > +    destroy_send_arps_nds();
> >      destroy_ipv6_ras();
> >      destroy_ipv6_prefixd();
> >      destroy_buffered_packets_ctx();
> > @@ -5077,6 +5094,150 @@ send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >      }
> >  }
> >  
> > +struct arp_nd_data {
> > +    struct hmap_node hmap_node;
> > +    struct eth_addr ea;          /* Ethernet address of port. */
> > +    struct in6_addr src_ip;      /* IP address of port. */
> > +    struct in6_addr dst_ip;      /* Destination IP address */
> > +    long long int announce_time; /* Next announcement in ms. */
> > +    int backoff;                 /* Backoff timeout for the next
> > +                                  * announcement (in msecs). */
> > +    uint32_t dp_key;             /* Datapath used to output this GARP. */
> > +    uint32_t port_key;           /* Port to inject the GARP into. */
> > +};
> > +
> > +static struct hmap send_arp_nd_data;
> > +
> > +static void
> > +init_send_arps_nds(void)
> > +{
> > +    hmap_init(&send_arp_nd_data);
> > +}
> > +
> > +static void
> > +destroy_send_arps_nds(void)
> > +{
> > +    struct arp_nd_data *e;
> > +    HMAP_FOR_EACH_POP (e, hmap_node, &send_arp_nd_data) {
> > +        free(e);
> > +    }
> > +    hmap_destroy(&send_arp_nd_data);
> > +}
> > +
> > +static struct arp_nd_data *
> > +arp_nd_find_data(const struct sbrec_port_binding *pb,
> > +                 const struct in6_addr *nexthop)
> > +{
> > +    uint32_t hash = 0;
> > +
> > +    hash = hash_add_in6_addr(hash, nexthop);
> > +    hash = hash_add(hash, pb->datapath->tunnel_key);
> > +    hash = hash_add(hash, pb->tunnel_key);
> > +
> > +    struct arp_nd_data *e;
> > +    HMAP_FOR_EACH_WITH_HASH (e, hmap_node, hash, &send_arp_nd_data) {
> > +        if (ipv6_addr_equals(&e->dst_ip, nexthop) &&
> > +            e->port_key == pb->tunnel_key) {
> > +            return e;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static bool
> > +arp_nd_find_is_stale(const struct arp_nd_data *e,
> > +                     const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
> > +                     const struct sbrec_chassis *chassis)
> > +{
> > +    const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop;
> > +    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) {
> > +        const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port;
> > +        if (pb->chassis != chassis) {
> > +            continue;
> > +        }
> > +
> > +        struct lport_addresses laddrs;
> > +        if (!extract_ip_addresses(sb_ecmp_nexthop->nexthop, &laddrs)) {
> > +            continue;
> > +        }
> > +
> > +        struct in6_addr dst_ip = laddrs.n_ipv4_addrs
> > +            ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr)
> > +            : laddrs.ipv6_addrs[0].addr;
> > +        destroy_lport_addresses(&laddrs);
> > +
> > +        if (pb->tunnel_key == e->port_key &&
> > +            pb->datapath->tunnel_key == e->dp_key &&
> > +            ipv6_addr_equals(&e->dst_ip, &dst_ip)) {
> > +            return false;
> > +        }
> > +    }
> 
> This is inefficient.  We should instead iterate all SB next hops and do
> a lookup in the arp_nd_data map.  All records from the arp_nd_data map
> that weren't found by corresponding SB.ECMP_Nexthop records should be
> considered stale.

ack, I will fix it.

> 
> > +    return true;
> > +}
> > +
> > +static struct arp_nd_data *
> > +arp_nd_alloc_data(const struct eth_addr ea,
> > +                  struct in6_addr src_ip, struct in6_addr dst_ip,
> > +                  const struct sbrec_port_binding *pb)
> > +{
> > +    struct arp_nd_data *e = xmalloc(sizeof *e);
> > +    e->ea = ea;
> > +    e->src_ip = src_ip;
> > +    e->dst_ip = dst_ip;
> > +    e->announce_time = time_msec() + 1000;
> > +    e->backoff = 1000; /* msec. */
> > +    e->dp_key = pb->datapath->tunnel_key;
> > +    e->port_key = pb->tunnel_key;
> > +
> > +    uint32_t hash = 0;
> > +    hash = hash_add_in6_addr(hash, &dst_ip);
> > +    hash = hash_add(hash, e->dp_key);
> > +    hash = hash_add(hash, e->port_key);
> > +    hmap_insert(&send_arp_nd_data, &e->hmap_node, hash);
> > +    notify_pinctrl_handler();
> > +
> > +    return e;
> > +}
> > +
> > +/* Add or update a vif for which ARPs need to be announced. */
> > +static void
> > +send_arp_nd_update(const struct sbrec_port_binding *pb, const char *nexthop,
> > +                   long long int max_arp_timeout, bool continuous_arp_nd)
> > +{
> > +    struct lport_addresses laddrs;
> > +    if (!extract_ip_addresses(nexthop, &laddrs)) {
> > +        return;
> > +    }
> > +
> > +    struct in6_addr dst_ip = laddrs.n_ipv4_addrs
> > +        ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr)
> > +        : laddrs.ipv6_addrs[0].addr;
> > +    destroy_lport_addresses(&laddrs);
> > +
> > +    struct arp_nd_data *e = arp_nd_find_data(pb, &dst_ip);
> > +    if (!e) {
> > +        if (!extract_lsp_addresses(pb->mac[0], &laddrs)) {
> > +            return;
> > +        }
> > +
> > +        if (laddrs.n_ipv4_addrs) {
> > +            arp_nd_alloc_data(laddrs.ea,
> > +                              in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr),
> > +                              dst_ip, pb);
> > +        } else if (laddrs.n_ipv6_addrs) {
> > +            arp_nd_alloc_data(laddrs.ea, laddrs.ipv6_addrs[0].addr,
> > +                              dst_ip, pb);
> > +        }
> > +        destroy_lport_addresses(&laddrs);
> 
> Do we need to notify pinctrl_handler in this case?

this is done in in arp_nd_alloc_data()

> 
> > +    } else if (max_arp_timeout != arp_nd_max_timeout ||
> > +               continuous_arp_nd != arp_nd_continuous) {
> > +        /* reset backoff */
> > +        e->announce_time = time_msec() + 1000;
> > +        e->backoff = 1000; /* msec. */
> 
> Same here?

ack, I will fix it

> 
> > +    }
> > +}
> > +
> >  /* Remove a vif from GARP announcements. */
> >  static void
> >  send_garp_rarp_delete(const char *lport)
> > @@ -6415,6 +6576,16 @@ send_garp_rarp_wait(long long int send_garp_rarp_time)
> >      }
> >  }
> >  
> > +static void
> > +send_arp_nd_wait(long long int send_arp_nd_time)
> > +{
> > +    /* Set the poll timer for next arp packet only if there is data to
> > +     * be sent. */
> > +    if (hmap_count(&send_arp_nd_data)) {
> > +        poll_timer_wait_until(send_arp_nd_time);
> > +    }
> > +}
> > +
> >  /* Called with in the pinctrl_handler thread context. */
> >  static void
> >  send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time)
> > @@ -6437,6 +6608,84 @@ send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time)
> >      }
> >  }
> >  
> > +static long long int
> > +send_arp_nd(struct rconn *swconn, struct arp_nd_data *e,
> > +            long long int current_time)
> 
> There's still a lot of code duplication with send_garp_rarp().  Let's
> add a single send_self_originated_neigh_packet() to accept source and
> destination IPs as in6_addr.
> 
> For example:
> 
> send_self_originated_neigh_packet(uint32_t dp_key, uint32_t port_key,
>                                   struct eth_addr eth,
>                                   struct in6_addr *local,
>                                   struct in6_addr *target)
> {
>     ...
>     if (!local) {
>         compose_rarp(&packet, eth);
>     } else if (IN6_IS_ADDR_V4MAPPED(local)) {
>         compose_arp(&packet, ARP_OP_REQUEST, eth,
>                     eth_addr_zero, true,
>                     in6_addr_get_mapped_ipv4(local),
>                     in6_addr_get_mapped_ipv4(target));
>     } else {
>         compose_nd_ns(&packet, eth, source, target);
>     }
>     ...
> }

ack, I will fix it

> 
> We can call this from send_garp_rarp() and send_arp_nd() with
> appropriate arguments.
> 
> > +    OVS_REQUIRES(pinctrl_mutex)
> > +{
> > +    if (current_time < e->announce_time) {
> > +        return e->announce_time;
> > +    }
> > +
> > +    /* Compose a ARP request packet. */
> > +    uint64_t packet_stub[128 / 8];
> > +    struct dp_packet packet;
> > +    dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub);
> > +    if (IN6_IS_ADDR_V4MAPPED(&e->src_ip)) {
> > +        compose_arp(&packet, ARP_OP_REQUEST, e->ea, eth_addr_zero,
> > +                    true, in6_addr_get_mapped_ipv4(&e->src_ip),
> > +                    in6_addr_get_mapped_ipv4(&e->dst_ip));
> > +    } else {
> > +        compose_nd_ns(&packet, e->ea, &e->src_ip, &e->dst_ip);
> > +    }
> > +
> > +    /* Inject ARP request. */
> > +    uint64_t ofpacts_stub[4096 / 8];
> > +    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
> > +    enum ofp_version version = rconn_get_version(swconn);
> > +    put_load(e->dp_key, MFF_LOG_DATAPATH, 0, 64, &ofpacts);
> > +    put_load(e->port_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
> > +    struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts);
> > +    resubmit->in_port = OFPP_CONTROLLER;
> > +    resubmit->table_id = OFTABLE_LOCAL_OUTPUT;
> 
> As fas as I can tell, there's no real reason to not reinject from the
> beginning of the pipeline (like we do with GARPs):
> 
> resubmit->table_id = OFTABLE_LOG_INGRESS_PIPELINE;

I guess we need to resubmit to OFTABLE_LOCAL_OUTPUT since port_key is the
outport and not the input one.

> 
> > +
> > +    struct ofputil_packet_out po = {
> > +        .packet = dp_packet_data(&packet),
> > +        .packet_len = dp_packet_size(&packet),
> > +        .buffer_id = UINT32_MAX,
> > +        .ofpacts = ofpacts.data,
> > +        .ofpacts_len = ofpacts.size,
> > +    };
> > +    match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER);
> > +    enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
> > +    queue_msg(swconn, ofputil_encode_packet_out(&po, proto));
> > +    dp_packet_uninit(&packet);
> > +    ofpbuf_uninit(&ofpacts);
> > +
> > +    /* Set the next announcement.  At most 5 announcements are sent for a
> > +     * vif if arp_nd_max_timeout is not specified otherwise cap the max
> > +     * timeout to arp_nd_max_timeout. */
> > +    if (arp_nd_continuous || e->backoff < arp_nd_max_timeout) {
> > +        e->announce_time = current_time + e->backoff;
> > +    } else {
> > +        e->announce_time = LLONG_MAX;
> > +    }
> > +    e->backoff = MIN(arp_nd_max_timeout, e->backoff * 2);
> > +
> > +    return e->announce_time;
> > +}
> > +
> > +static void
> > +send_arp_nd_run(struct rconn *swconn, long long int *send_arp_nd_time)
> > +    OVS_REQUIRES(pinctrl_mutex)
> > +{
> > +    if (!hmap_count(&send_arp_nd_data)) {
> > +        return;
> > +    }
> > +
> > +    /* Send ARPs, and update the next announcement. */
> > +    long long int current_time = time_msec();
> > +    *send_arp_nd_time = LLONG_MAX;
> > +
> > +    struct arp_nd_data *e;
> > +    HMAP_FOR_EACH (e, hmap_node, &send_arp_nd_data) {
> > +        long long int next_announce = send_arp_nd(swconn, e, current_time);
> > +        if (*send_arp_nd_time > next_announce) {
> > +            *send_arp_nd_time = next_announce;
> > +        }
> > +    }
> > +}
> > +
> >  /* Called by pinctrl_run(). Runs with in the main ovn-controller
> >   * thread context. */
> >  static void
> > @@ -6444,6 +6693,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >                         struct ovsdb_idl_index *sbrec_port_binding_by_datapath,
> >                         struct ovsdb_idl_index *sbrec_port_binding_by_name,
> >                         struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip,
> > +                       const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
> >                         const struct ovsrec_bridge *br_int,
> >                         const struct sbrec_chassis *chassis,
> >                         const struct hmap *local_datapaths,
> > @@ -6456,7 +6706,8 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >      struct sset nat_ip_keys = SSET_INITIALIZER(&nat_ip_keys);
> >      struct shash nat_addresses;
> >      unsigned long long garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> > -    bool garp_continuous = false;
> > +    unsigned long long max_arp_nd_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> > +    bool garp_continuous = false, continuous_arp_nd = true;
> >      const struct ovsrec_open_vswitch *cfg =
> >          ovsrec_open_vswitch_table_first(ovs_table);
> >      if (cfg) {
> > @@ -6466,6 +6717,11 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >          if (!garp_max_timeout) {
> >              garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
> >          }
> > +
> > +        max_arp_nd_timeout = smap_get_ullong(
> > +                &cfg->external_ids, "arp-nd-max-timeout-sec",
> > +                GARP_RARP_DEF_MAX_TIMEOUT / 1000) * 1000;
> > +        continuous_arp_nd = !!max_arp_nd_timeout;
> >      }
> >  
> >      shash_init(&nat_addresses);
> > @@ -6479,6 +6735,7 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >                                 &nat_ip_keys, &local_l3gw_ports,
> >                                 chassis, active_tunnels,
> >                                 &nat_addresses);
> > +
> >      /* For deleted ports and deleted nat ips, remove from
> >       * send_garp_rarp_data. */
> >      struct shash_node *iter;
> > @@ -6514,6 +6771,24 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >          }
> >      }
> >  
> > +    struct arp_nd_data *e;
> > +    const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop;
> > +    HMAP_FOR_EACH_SAFE (e, hmap_node, &send_arp_nd_data) {
> > +        if (arp_nd_find_is_stale(e, ecmp_nh_table, chassis)) {
> > +            hmap_remove(&send_arp_nd_data, &e->hmap_node);
> > +            free(e);
> > +            notify_pinctrl_handler();
> 
> This takes the pinctrl_handler_seq lock every time and tries to wake up
> any waiters (but those are probably blocked in the pinctrl_mutex).  We
> should probably just notify once, outside the loop, if there's any stale
> entry.
> 
> On the other hand, why do we even need to notify pinctrl_handler()?  We
> removed entries, we don't need to generate new ARPs.  Or am I missing
> something?

ack, I will fix it.

Regards,
Lorenzo

> 
> 
> > +        }
> > +    }
> > +
> > +    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) {
> > +        const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port;
> > +        if (pb && !strcmp(pb->type, "l3gateway") && pb->chassis == chassis) {
> > +            send_arp_nd_update(pb, sb_ecmp_nexthop->nexthop,
> > +                               max_arp_nd_timeout, continuous_arp_nd);
> > +        }
> > +    }
> > +
> >      /* pinctrl_handler thread will send the GARPs. */
> >  
> >      sset_destroy(&localnet_vifs);
> > @@ -6531,6 +6806,9 @@ send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >  
> >      garp_rarp_max_timeout = garp_max_timeout;
> >      garp_rarp_continuous = garp_continuous;
> > +
> > +    arp_nd_max_timeout = max_arp_nd_timeout;
> > +    arp_nd_continuous = continuous_arp_nd;
> >  }
> >  
> >  static bool
> > diff --git a/controller/pinctrl.h b/controller/pinctrl.h
> > index 846afe0a4..8459f4f53 100644
> > --- a/controller/pinctrl.h
> > +++ b/controller/pinctrl.h
> > @@ -36,6 +36,7 @@ struct sbrec_dns_table;
> >  struct sbrec_controller_event_table;
> >  struct sbrec_service_monitor_table;
> >  struct sbrec_bfd_table;
> > +struct sbrec_ecmp_nexthop_table;
> >  struct sbrec_port_binding;
> >  struct sbrec_mac_binding_table;
> >  
> > @@ -53,6 +54,7 @@ void pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
> >                   const struct sbrec_service_monitor_table *,
> >                   const struct sbrec_mac_binding_table *,
> >                   const struct sbrec_bfd_table *,
> > +                 const struct sbrec_ecmp_nexthop_table *,
> >                   const struct ovsrec_bridge *, const struct sbrec_chassis *,
> >                   const struct hmap *local_datapaths,
> >                   const struct sset *active_tunnels,
> 
> Regards,
> Dumitru
>
Dumitru Ceara Dec. 20, 2024, 9:49 a.m. UTC | #3
On 12/19/24 11:50 PM, Lorenzo Bianconi wrote:
>>> +    uint64_t ofpacts_stub[4096 / 8];
>>> +    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
>>> +    enum ofp_version version = rconn_get_version(swconn);
>>> +    put_load(e->dp_key, MFF_LOG_DATAPATH, 0, 64, &ofpacts);
>>> +    put_load(e->port_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
>>> +    struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts);
>>> +    resubmit->in_port = OFPP_CONTROLLER;
>>> +    resubmit->table_id = OFTABLE_LOCAL_OUTPUT;
>> As fas as I can tell, there's no real reason to not reinject from the
>> beginning of the pipeline (like we do with GARPs):
>>
>> resubmit->table_id = OFTABLE_LOG_INGRESS_PIPELINE;
> I guess we need to resubmit to OFTABLE_LOCAL_OUTPUT since port_key is the
> outport and not the input one.

I see, OK.

Thanks,
Dumitru
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index da3aba739..1f8f54d5d 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,11 @@  Post v24.09.0
      hash (with specified hash fields) for ECMP routes
      while choosing nexthop.
    - ovn-ic: Add support for route tag to prevent route learning.
+   - Add "arp-max-timeout-sec" config option to vswitchd external-ids to
+     cap the time between when ovn-controller sends ARP/ND packets for
+     ECMP-nexthop.
+     By default ovn-controller continuously sends ARP/ND packets for
+     ECMP-nexthop.
 
 OVN v24.09.0 - 13 Sep 2024
 --------------------------
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
index aeaa374c1..7f95a9932 100644
--- a/controller/ovn-controller.8.xml
+++ b/controller/ovn-controller.8.xml
@@ -385,6 +385,16 @@ 
         cap for the exponential backoff used by <code>ovn-controller</code>
         to send GARPs packets.
       </dd>
+      <dt><code>external_ids:arp-nd-max-timeout-sec</code></dt>
+      <dd>
+        When used, this configuration value specifies the maximum timeout
+        (in seconds) between two consecutive ARP/ND packets sent by
+        <code>ovn-controller</code> to resolve ECMP nexthop mac address.
+        <code>ovn-controller</code> by default continuously sends ARP/ND
+        packets. Setting <code>external_ids:arp-nd-max-timeout-sec</code>
+        allows to cap for the exponential backoff used by <code>ovn-controller
+        </code> to send ARPs/NDs packets.
+      </dd>
       <dt><code>external_ids:ovn-bridge-remote</code></dt>
       <dd>
         <p>
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 98b144699..ecfa3b229 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -5743,6 +5743,8 @@  main(int argc, char *argv[])
                                     sbrec_mac_binding_table_get(
                                         ovnsb_idl_loop.idl),
                                     sbrec_bfd_table_get(ovnsb_idl_loop.idl),
+                                    sbrec_ecmp_nexthop_table_get(
+                                        ovnsb_idl_loop.idl),
                                     br_int, chassis,
                                     &runtime_data->local_datapaths,
                                     &runtime_data->active_tunnels,
diff --git a/controller/pinctrl.c b/controller/pinctrl.c
index 3fb7e2fd7..eb6043ef8 100644
--- a/controller/pinctrl.c
+++ b/controller/pinctrl.c
@@ -164,6 +164,9 @@  static struct seq *pinctrl_main_seq;
 static long long int garp_rarp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
 static bool garp_rarp_continuous;
 
+static long long int arp_nd_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
+static bool arp_nd_continuous;
+
 static void *pinctrl_handler(void *arg);
 
 struct pinctrl {
@@ -223,13 +226,17 @@  static void run_activated_ports(
     const struct sbrec_chassis *chassis);
 
 static void init_send_garps_rarps(void);
+static void init_send_arps_nds(void);
 static void destroy_send_garps_rarps(void);
+static void destroy_send_arps_nds(void);
 static void send_garp_rarp_wait(long long int send_garp_rarp_time);
+static void send_arp_nd_wait(long long int send_arp_nd_time);
 static void send_garp_rarp_prepare(
     struct ovsdb_idl_txn *ovnsb_idl_txn,
     struct ovsdb_idl_index *sbrec_port_binding_by_datapath,
     struct ovsdb_idl_index *sbrec_port_binding_by_name,
     struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip,
+    const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
     const struct ovsrec_bridge *,
     const struct sbrec_chassis *,
     const struct hmap *local_datapaths,
@@ -239,6 +246,9 @@  static void send_garp_rarp_prepare(
 static void send_garp_rarp_run(struct rconn *swconn,
                                long long int *send_garp_rarp_time)
     OVS_REQUIRES(pinctrl_mutex);
+static void send_arp_nd_run(struct rconn *swconn,
+                            long long int *send_arp_nd_time)
+    OVS_REQUIRES(pinctrl_mutex);
 static void pinctrl_handle_nd_na(struct rconn *swconn,
                                  const struct flow *ip_flow,
                                  const struct match *md,
@@ -548,6 +558,7 @@  pinctrl_init(void)
 {
     init_put_mac_bindings();
     init_send_garps_rarps();
+    init_send_arps_nds();
     init_ipv6_ras();
     init_ipv6_prefixd();
     init_buffered_packets_ctx();
@@ -3878,6 +3889,7 @@  pinctrl_handler(void *arg_)
     static long long int send_ipv6_ra_time = LLONG_MAX;
     /* Next GARP/RARP announcement in ms. */
     static long long int send_garp_rarp_time = LLONG_MAX;
+    static long long int send_arp_nd_time = LLONG_MAX;
     /* Next multicast query (IGMP) in ms. */
     static long long int send_mcast_query_time = LLONG_MAX;
     static long long int svc_monitors_next_run_time = LLONG_MAX;
@@ -3915,6 +3927,7 @@  pinctrl_handler(void *arg_)
             if (may_inject_pkts()) {
                 ovs_mutex_lock(&pinctrl_mutex);
                 send_garp_rarp_run(swconn, &send_garp_rarp_time);
+                send_arp_nd_run(swconn, &send_arp_nd_time);
                 send_ipv6_ras(swconn, &send_ipv6_ra_time);
                 send_ipv6_prefixd(swconn, &send_prefixd_time);
                 send_mac_binding_buffered_pkts(swconn);
@@ -3933,6 +3946,7 @@  pinctrl_handler(void *arg_)
         rconn_recv_wait(swconn);
         if (rconn_is_connected(swconn)) {
             send_garp_rarp_wait(send_garp_rarp_time);
+            send_arp_nd_wait(send_arp_nd_time);
             ipv6_ra_wait(send_ipv6_ra_time);
             ip_mcast_querier_wait(send_mcast_query_time);
             svc_monitors_wait(svc_monitors_next_run_time);
@@ -4019,6 +4033,7 @@  pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
             const struct sbrec_service_monitor_table *svc_mon_table,
             const struct sbrec_mac_binding_table *mac_binding_table,
             const struct sbrec_bfd_table *bfd_table,
+            const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
             const struct ovsrec_bridge *br_int,
             const struct sbrec_chassis *chassis,
             const struct hmap *local_datapaths,
@@ -4035,8 +4050,9 @@  pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
                            sbrec_port_binding_by_key, chassis);
     send_garp_rarp_prepare(ovnsb_idl_txn, sbrec_port_binding_by_datapath,
                            sbrec_port_binding_by_name,
-                           sbrec_mac_binding_by_lport_ip, br_int, chassis,
-                           local_datapaths, active_tunnels, ovs_table);
+                           sbrec_mac_binding_by_lport_ip, ecmp_nh_table,
+                           br_int, chassis, local_datapaths, active_tunnels,
+                           ovs_table);
     prepare_ipv6_ras(local_active_ports_ras, sbrec_port_binding_by_name);
     prepare_ipv6_prefixd(ovnsb_idl_txn, sbrec_port_binding_by_name,
                          local_active_ports_ipv6_pd, chassis,
@@ -4570,6 +4586,7 @@  pinctrl_destroy(void)
     latch_destroy(&pinctrl.pinctrl_thread_exit);
     rconn_destroy(pinctrl.swconn);
     destroy_send_garps_rarps();
+    destroy_send_arps_nds();
     destroy_ipv6_ras();
     destroy_ipv6_prefixd();
     destroy_buffered_packets_ctx();
@@ -5077,6 +5094,150 @@  send_garp_rarp_update(struct ovsdb_idl_txn *ovnsb_idl_txn,
     }
 }
 
+struct arp_nd_data {
+    struct hmap_node hmap_node;
+    struct eth_addr ea;          /* Ethernet address of port. */
+    struct in6_addr src_ip;      /* IP address of port. */
+    struct in6_addr dst_ip;      /* Destination IP address */
+    long long int announce_time; /* Next announcement in ms. */
+    int backoff;                 /* Backoff timeout for the next
+                                  * announcement (in msecs). */
+    uint32_t dp_key;             /* Datapath used to output this GARP. */
+    uint32_t port_key;           /* Port to inject the GARP into. */
+};
+
+static struct hmap send_arp_nd_data;
+
+static void
+init_send_arps_nds(void)
+{
+    hmap_init(&send_arp_nd_data);
+}
+
+static void
+destroy_send_arps_nds(void)
+{
+    struct arp_nd_data *e;
+    HMAP_FOR_EACH_POP (e, hmap_node, &send_arp_nd_data) {
+        free(e);
+    }
+    hmap_destroy(&send_arp_nd_data);
+}
+
+static struct arp_nd_data *
+arp_nd_find_data(const struct sbrec_port_binding *pb,
+                 const struct in6_addr *nexthop)
+{
+    uint32_t hash = 0;
+
+    hash = hash_add_in6_addr(hash, nexthop);
+    hash = hash_add(hash, pb->datapath->tunnel_key);
+    hash = hash_add(hash, pb->tunnel_key);
+
+    struct arp_nd_data *e;
+    HMAP_FOR_EACH_WITH_HASH (e, hmap_node, hash, &send_arp_nd_data) {
+        if (ipv6_addr_equals(&e->dst_ip, nexthop) &&
+            e->port_key == pb->tunnel_key) {
+            return e;
+        }
+    }
+
+    return NULL;
+}
+
+static bool
+arp_nd_find_is_stale(const struct arp_nd_data *e,
+                     const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
+                     const struct sbrec_chassis *chassis)
+{
+    const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop;
+    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) {
+        const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port;
+        if (pb->chassis != chassis) {
+            continue;
+        }
+
+        struct lport_addresses laddrs;
+        if (!extract_ip_addresses(sb_ecmp_nexthop->nexthop, &laddrs)) {
+            continue;
+        }
+
+        struct in6_addr dst_ip = laddrs.n_ipv4_addrs
+            ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr)
+            : laddrs.ipv6_addrs[0].addr;
+        destroy_lport_addresses(&laddrs);
+
+        if (pb->tunnel_key == e->port_key &&
+            pb->datapath->tunnel_key == e->dp_key &&
+            ipv6_addr_equals(&e->dst_ip, &dst_ip)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static struct arp_nd_data *
+arp_nd_alloc_data(const struct eth_addr ea,
+                  struct in6_addr src_ip, struct in6_addr dst_ip,
+                  const struct sbrec_port_binding *pb)
+{
+    struct arp_nd_data *e = xmalloc(sizeof *e);
+    e->ea = ea;
+    e->src_ip = src_ip;
+    e->dst_ip = dst_ip;
+    e->announce_time = time_msec() + 1000;
+    e->backoff = 1000; /* msec. */
+    e->dp_key = pb->datapath->tunnel_key;
+    e->port_key = pb->tunnel_key;
+
+    uint32_t hash = 0;
+    hash = hash_add_in6_addr(hash, &dst_ip);
+    hash = hash_add(hash, e->dp_key);
+    hash = hash_add(hash, e->port_key);
+    hmap_insert(&send_arp_nd_data, &e->hmap_node, hash);
+    notify_pinctrl_handler();
+
+    return e;
+}
+
+/* Add or update a vif for which ARPs need to be announced. */
+static void
+send_arp_nd_update(const struct sbrec_port_binding *pb, const char *nexthop,
+                   long long int max_arp_timeout, bool continuous_arp_nd)
+{
+    struct lport_addresses laddrs;
+    if (!extract_ip_addresses(nexthop, &laddrs)) {
+        return;
+    }
+
+    struct in6_addr dst_ip = laddrs.n_ipv4_addrs
+        ? in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr)
+        : laddrs.ipv6_addrs[0].addr;
+    destroy_lport_addresses(&laddrs);
+
+    struct arp_nd_data *e = arp_nd_find_data(pb, &dst_ip);
+    if (!e) {
+        if (!extract_lsp_addresses(pb->mac[0], &laddrs)) {
+            return;
+        }
+
+        if (laddrs.n_ipv4_addrs) {
+            arp_nd_alloc_data(laddrs.ea,
+                              in6_addr_mapped_ipv4(laddrs.ipv4_addrs[0].addr),
+                              dst_ip, pb);
+        } else if (laddrs.n_ipv6_addrs) {
+            arp_nd_alloc_data(laddrs.ea, laddrs.ipv6_addrs[0].addr,
+                              dst_ip, pb);
+        }
+        destroy_lport_addresses(&laddrs);
+    } else if (max_arp_timeout != arp_nd_max_timeout ||
+               continuous_arp_nd != arp_nd_continuous) {
+        /* reset backoff */
+        e->announce_time = time_msec() + 1000;
+        e->backoff = 1000; /* msec. */
+    }
+}
+
 /* Remove a vif from GARP announcements. */
 static void
 send_garp_rarp_delete(const char *lport)
@@ -6415,6 +6576,16 @@  send_garp_rarp_wait(long long int send_garp_rarp_time)
     }
 }
 
+static void
+send_arp_nd_wait(long long int send_arp_nd_time)
+{
+    /* Set the poll timer for next arp packet only if there is data to
+     * be sent. */
+    if (hmap_count(&send_arp_nd_data)) {
+        poll_timer_wait_until(send_arp_nd_time);
+    }
+}
+
 /* Called with in the pinctrl_handler thread context. */
 static void
 send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time)
@@ -6437,6 +6608,84 @@  send_garp_rarp_run(struct rconn *swconn, long long int *send_garp_rarp_time)
     }
 }
 
+static long long int
+send_arp_nd(struct rconn *swconn, struct arp_nd_data *e,
+            long long int current_time)
+    OVS_REQUIRES(pinctrl_mutex)
+{
+    if (current_time < e->announce_time) {
+        return e->announce_time;
+    }
+
+    /* Compose a ARP request packet. */
+    uint64_t packet_stub[128 / 8];
+    struct dp_packet packet;
+    dp_packet_use_stub(&packet, packet_stub, sizeof packet_stub);
+    if (IN6_IS_ADDR_V4MAPPED(&e->src_ip)) {
+        compose_arp(&packet, ARP_OP_REQUEST, e->ea, eth_addr_zero,
+                    true, in6_addr_get_mapped_ipv4(&e->src_ip),
+                    in6_addr_get_mapped_ipv4(&e->dst_ip));
+    } else {
+        compose_nd_ns(&packet, e->ea, &e->src_ip, &e->dst_ip);
+    }
+
+    /* Inject ARP request. */
+    uint64_t ofpacts_stub[4096 / 8];
+    struct ofpbuf ofpacts = OFPBUF_STUB_INITIALIZER(ofpacts_stub);
+    enum ofp_version version = rconn_get_version(swconn);
+    put_load(e->dp_key, MFF_LOG_DATAPATH, 0, 64, &ofpacts);
+    put_load(e->port_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
+    struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts);
+    resubmit->in_port = OFPP_CONTROLLER;
+    resubmit->table_id = OFTABLE_LOCAL_OUTPUT;
+
+    struct ofputil_packet_out po = {
+        .packet = dp_packet_data(&packet),
+        .packet_len = dp_packet_size(&packet),
+        .buffer_id = UINT32_MAX,
+        .ofpacts = ofpacts.data,
+        .ofpacts_len = ofpacts.size,
+    };
+    match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER);
+    enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
+    queue_msg(swconn, ofputil_encode_packet_out(&po, proto));
+    dp_packet_uninit(&packet);
+    ofpbuf_uninit(&ofpacts);
+
+    /* Set the next announcement.  At most 5 announcements are sent for a
+     * vif if arp_nd_max_timeout is not specified otherwise cap the max
+     * timeout to arp_nd_max_timeout. */
+    if (arp_nd_continuous || e->backoff < arp_nd_max_timeout) {
+        e->announce_time = current_time + e->backoff;
+    } else {
+        e->announce_time = LLONG_MAX;
+    }
+    e->backoff = MIN(arp_nd_max_timeout, e->backoff * 2);
+
+    return e->announce_time;
+}
+
+static void
+send_arp_nd_run(struct rconn *swconn, long long int *send_arp_nd_time)
+    OVS_REQUIRES(pinctrl_mutex)
+{
+    if (!hmap_count(&send_arp_nd_data)) {
+        return;
+    }
+
+    /* Send ARPs, and update the next announcement. */
+    long long int current_time = time_msec();
+    *send_arp_nd_time = LLONG_MAX;
+
+    struct arp_nd_data *e;
+    HMAP_FOR_EACH (e, hmap_node, &send_arp_nd_data) {
+        long long int next_announce = send_arp_nd(swconn, e, current_time);
+        if (*send_arp_nd_time > next_announce) {
+            *send_arp_nd_time = next_announce;
+        }
+    }
+}
+
 /* Called by pinctrl_run(). Runs with in the main ovn-controller
  * thread context. */
 static void
@@ -6444,6 +6693,7 @@  send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
                        struct ovsdb_idl_index *sbrec_port_binding_by_datapath,
                        struct ovsdb_idl_index *sbrec_port_binding_by_name,
                        struct ovsdb_idl_index *sbrec_mac_binding_by_lport_ip,
+                       const struct sbrec_ecmp_nexthop_table *ecmp_nh_table,
                        const struct ovsrec_bridge *br_int,
                        const struct sbrec_chassis *chassis,
                        const struct hmap *local_datapaths,
@@ -6456,7 +6706,8 @@  send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
     struct sset nat_ip_keys = SSET_INITIALIZER(&nat_ip_keys);
     struct shash nat_addresses;
     unsigned long long garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
-    bool garp_continuous = false;
+    unsigned long long max_arp_nd_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
+    bool garp_continuous = false, continuous_arp_nd = true;
     const struct ovsrec_open_vswitch *cfg =
         ovsrec_open_vswitch_table_first(ovs_table);
     if (cfg) {
@@ -6466,6 +6717,11 @@  send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
         if (!garp_max_timeout) {
             garp_max_timeout = GARP_RARP_DEF_MAX_TIMEOUT;
         }
+
+        max_arp_nd_timeout = smap_get_ullong(
+                &cfg->external_ids, "arp-nd-max-timeout-sec",
+                GARP_RARP_DEF_MAX_TIMEOUT / 1000) * 1000;
+        continuous_arp_nd = !!max_arp_nd_timeout;
     }
 
     shash_init(&nat_addresses);
@@ -6479,6 +6735,7 @@  send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
                                &nat_ip_keys, &local_l3gw_ports,
                                chassis, active_tunnels,
                                &nat_addresses);
+
     /* For deleted ports and deleted nat ips, remove from
      * send_garp_rarp_data. */
     struct shash_node *iter;
@@ -6514,6 +6771,24 @@  send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
         }
     }
 
+    struct arp_nd_data *e;
+    const struct sbrec_ecmp_nexthop *sb_ecmp_nexthop;
+    HMAP_FOR_EACH_SAFE (e, hmap_node, &send_arp_nd_data) {
+        if (arp_nd_find_is_stale(e, ecmp_nh_table, chassis)) {
+            hmap_remove(&send_arp_nd_data, &e->hmap_node);
+            free(e);
+            notify_pinctrl_handler();
+        }
+    }
+
+    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sb_ecmp_nexthop, ecmp_nh_table) {
+        const struct sbrec_port_binding *pb = sb_ecmp_nexthop->port;
+        if (pb && !strcmp(pb->type, "l3gateway") && pb->chassis == chassis) {
+            send_arp_nd_update(pb, sb_ecmp_nexthop->nexthop,
+                               max_arp_nd_timeout, continuous_arp_nd);
+        }
+    }
+
     /* pinctrl_handler thread will send the GARPs. */
 
     sset_destroy(&localnet_vifs);
@@ -6531,6 +6806,9 @@  send_garp_rarp_prepare(struct ovsdb_idl_txn *ovnsb_idl_txn,
 
     garp_rarp_max_timeout = garp_max_timeout;
     garp_rarp_continuous = garp_continuous;
+
+    arp_nd_max_timeout = max_arp_nd_timeout;
+    arp_nd_continuous = continuous_arp_nd;
 }
 
 static bool
diff --git a/controller/pinctrl.h b/controller/pinctrl.h
index 846afe0a4..8459f4f53 100644
--- a/controller/pinctrl.h
+++ b/controller/pinctrl.h
@@ -36,6 +36,7 @@  struct sbrec_dns_table;
 struct sbrec_controller_event_table;
 struct sbrec_service_monitor_table;
 struct sbrec_bfd_table;
+struct sbrec_ecmp_nexthop_table;
 struct sbrec_port_binding;
 struct sbrec_mac_binding_table;
 
@@ -53,6 +54,7 @@  void pinctrl_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
                  const struct sbrec_service_monitor_table *,
                  const struct sbrec_mac_binding_table *,
                  const struct sbrec_bfd_table *,
+                 const struct sbrec_ecmp_nexthop_table *,
                  const struct ovsrec_bridge *, const struct sbrec_chassis *,
                  const struct hmap *local_datapaths,
                  const struct sset *active_tunnels,