diff mbox

[ovs-dev,v4,3/7] dpif-netdev: Register packet processing cores to KA framework.

Message ID 1503394869-76348-4-git-send-email-bhanuprakash.bodireddy@intel.com
State Superseded
Headers show

Commit Message

Bodireddy, Bhanuprakash Aug. 22, 2017, 9:41 a.m. UTC
This commit registers the packet processing PMD cores to keepalive
framework. Only PMDs that have rxqs mapped will be registered and
actively monitored by KA framework.

This commit spawns a keepalive thread that will dispatch heartbeats to
PMD cores. The pmd threads respond to heartbeats by marking themselves
alive. As long as PMD responds to heartbeats it is considered 'healthy'.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
---
 lib/dpif-netdev.c |  70 +++++++++++++++++++++++++
 lib/keepalive.c   | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 lib/keepalive.h   |  17 ++++++
 lib/util.c        |  23 ++++++++
 lib/util.h        |   2 +
 5 files changed, 254 insertions(+), 11 deletions(-)

Comments

Aaron Conole Sept. 6, 2017, 9:15 p.m. UTC | #1
Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:

> This commit registers the packet processing PMD cores to keepalive
> framework. Only PMDs that have rxqs mapped will be registered and
> actively monitored by KA framework.
>
> This commit spawns a keepalive thread that will dispatch heartbeats to
> PMD cores. The pmd threads respond to heartbeats by marking themselves
> alive. As long as PMD responds to heartbeats it is considered 'healthy'.
>
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
> ---

I'm really confused with this patch.  I've stopped reviewing the series.

It seems like there's a mix of 'track by core id' and 'track by thread
id'.

I don't think it's possible to do anything by core id.  We can never
know what else has been scheduled on those cores, and we cannot be sure
that a taskset or other scheduler provisioning call will move the
threads.

>  lib/dpif-netdev.c |  70 +++++++++++++++++++++++++
>  lib/keepalive.c   | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/keepalive.h   |  17 ++++++
>  lib/util.c        |  23 ++++++++
>  lib/util.h        |   2 +
>  5 files changed, 254 insertions(+), 11 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e2cd931..84c7ffd 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -49,6 +49,7 @@
>  #include "flow.h"
>  #include "hmapx.h"
>  #include "id-pool.h"
> +#include "keepalive.h"
>  #include "latch.h"
>  #include "netdev.h"
>  #include "netdev-vport.h"
> @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp,
>      *n = k;
>  }
>  
> +static void *
> +ovs_keepalive(void *f_ OVS_UNUSED)
> +{
> +    pthread_detach(pthread_self());
> +
> +    for (;;) {
> +        xusleep(get_ka_interval() * 1000);
> +    }
> +
> +    return NULL;
> +}
> +
> +static void
> +ka_thread_start(struct dp_netdev *dp)
> +{
> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> +
> +    if (ovsthread_once_start(&once)) {
> +        ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
> +
> +        ovsthread_once_done(&once);
> +    }
> +}
> +
> +static void
> +ka_register_datapath_threads(struct dp_netdev *dp)
> +{
> +    int ka_init = get_ka_init_status();
> +    VLOG_DBG("Keepalive: Was initialization successful? [%s]",
> +                ka_init ? "Success" : "Failure");
> +    if (!ka_init) {
> +        return;
> +    }
> +
> +    ka_thread_start(dp);
> +
> +    struct dp_netdev_pmd_thread *pmd;
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        /*  Register only PMD threads. */
> +        if (pmd->core_id != NON_PMD_CORE_ID) {
> +            int tid = ka_get_pmd_tid(pmd->core_id);
> +
> +            /* Skip PMD thread with no rxqs mapping. */

why skip these pmds?  we should still watch them, and then we can
correlated interesting events (for instance... when an rxq gets added
whats the change in utilization, etc).

> +            if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
> +                /* rxq mapping changes due to reconfiguration,
> +                 * if there are no rxqs mapped to PMD, unregister it. */
> +                ka_unregister_thread(tid, true);
> +                continue;
> +            }
> +
> +            ka_register_thread(tid, true);
> +            VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA framework",
> +                      tid, pmd->core_id);
> +        }
> +    }
> +}
> +
>  static void
>  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
>                       void *aux)
> @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>  
>      /* Reload affected pmd threads. */
>      reload_affected_pmds(dp);
> +
> +    /* Register datapath threads to KA monitoring. */
> +    ka_register_datapath_threads(dp);
>  }
>  
>  /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
> @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_)
>  
>      poll_list = NULL;
>  
> +    ka_store_pmd_id(pmd->core_id);
> +
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> @@ -3859,6 +3922,9 @@ reload:
>                                                        : PMD_CYCLES_IDLE);
>          }
>  
> +        /* Mark PMD thread alive. */
> +        ka_mark_pmd_thread_alive(pmd->core_id);
> +
>          if (lc++ > 1024) {
>              bool reload;
>  
> @@ -3892,6 +3958,10 @@ reload:
>      }
>  
>      emc_cache_uninit(&pmd->flow_cache);
> +
> +    int tid = ka_get_pmd_tid(pmd->core_id);
> +    ka_unregister_thread(tid, true);
> +
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
>      return NULL;
> diff --git a/lib/keepalive.c b/lib/keepalive.c
> index ac73a42..dfafaeb 100644
> --- a/lib/keepalive.c
> +++ b/lib/keepalive.c
> @@ -23,6 +23,7 @@
>  #include "keepalive.h"
>  #include "lib/vswitch-idl.h"
>  #include "openvswitch/vlog.h"
> +#include "process.h"
>  #include "timeval.h"
>  
>  VLOG_DEFINE_THIS_MODULE(keepalive);
> @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx)
>      return -EINVAL;
>  }
>  
> +/* Return the Keepalive timer interval. */
> +inline uint32_t

Same as before w.r.t. inline in c files.

> +get_ka_interval(void)
> +{
> +    return keepalive_timer_interval;
> +}
> +
> +int
> +get_ka_init_status(void)
> +{
> +    return ka_init_status;
> +}
> +
> +void
> +ka_store_pmd_id(unsigned core_idx)
> +{
> +    int tid = gettid();
> +
> +    if (ka_is_enabled()) {
> +        ovs_assert(tid > 0);

If someone changes pid_max to be a larger number than a 32-bit integer,
won't this assert trigger?

> +        ka_info->thread_id[core_idx] = tid;
> +    }
> +}
> +
> +/* Register thread to KA framework. */
> +void
> +ka_register_thread(int tid, bool thread_is_pmd)
> +{
> +    if (ka_is_enabled()) {
> +        struct ka_process_info *ka_pinfo;
> +        int core_num = -1;
> +        char proc_name[18] = "UNDEFINED";
> +
> +        struct process_info pinfo;
> +        int success = get_process_info(tid, &pinfo);
> +        if (success) {
> +            core_num = pinfo.core_id;
> +            ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
> +        }
> +
> +        ovs_assert(core_num >= 0);
> +
> +        uint32_t hash = hash_int(tid, 0);
> +        ovs_mutex_lock(&ka_info->proclist_mutex);
> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
> +                                 hash, &ka_info->process_list) {
> +            /* PMD thread is already registered. */
> +            if (ka_pinfo->tid == tid) {
> +                goto out;
> +            }
> +        }
> +
> +        ka_pinfo = xmalloc(sizeof *ka_pinfo);
> +        ka_pinfo->tid = tid;
> +        ka_pinfo->heartbeats = true;
> +        ka_pinfo->core_id = core_num;
> +        ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name);
> +
> +        hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
> +
> +        if (thread_is_pmd) {

Why treat these differently?  Get rid of the thread_is_pmd.

> +            ka_info->active_cores[core_num] = KA_STATE_ALIVE;
> +            ka_info->last_alive[core_num] = time_wall_msec();

As noted in the previous patch, there is no way to properly infer core
usage from this, so get rid of all the "core based" tracking.

> +            ka_info->pmd_cnt++;  /* Increment PMD count. */
> +        } else {
> +            ka_info->nonpmd_cnt++;  /* Increment non-pmd thread count. */
> +        }
> +out:
> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +    }
> +}
> +
> +/* Unregister thread from KA framework. */
> +void
> +ka_unregister_thread(int tid, bool thread_is_pmd)
> +{
> +    if (ka_is_enabled()) {
> +        struct ka_process_info *ka_pinfo;
> +
> +        int core_num = -1;
> +        struct process_info pinfo;
> +        if (get_process_info(tid, &pinfo)) {
> +            core_num = pinfo.core_id;
> +        }
> +
> +        ovs_assert(core_num >= 0);
> +
> +        ovs_mutex_lock(&ka_info->proclist_mutex);
> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
> +                                 &ka_info->process_list) {
> +            /* If PMD thread is registered, remove it from the list */
> +            if (ka_pinfo->tid == tid) {
> +                hmap_remove(&ka_info->process_list, &ka_pinfo->node);
> +                free(ka_pinfo);
> +
> +                if (thread_is_pmd) {
> +                    ka_info->active_cores[core_num] = KA_STATE_UNUSED;
> +                    ka_info->pmd_cnt--;  /* Decrement PMD count. */
> +                } else {
> +                    ka_info->nonpmd_cnt--;  /* Decrement non-pmd thread cnt. */
> +                }
> +
> +                break;
> +            }
> +        }
> +
> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +    }
> +}
> +
> +/* Mark packet processing core alive. */
> +void
> +ka_mark_pmd_thread_alive(unsigned core_id)
> +{
> +    if (ka_is_enabled()) {
> +        ka_info->state_flags[core_id] = KA_STATE_ALIVE;

shouldn't this be done by tid not coreid?

> +    }
> +}
> +
> +/* Mark packet processing core as idle. */
> +inline void

And again.

> +ka_mark_pmd_thread_sleep(unsigned core_id)
> +{
> +    if (ka_is_enabled()) {
> +        ka_info->state_flags[core_id] = KA_STATE_DOZING;
> +    }
> +}
> +
>  void
>  ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
>                      uint64_t last_alive)
> @@ -166,18 +295,20 @@ ka_init(const struct smap *ovs_other_config)

This whole diff section is inappropriate.  Please don't add code in one
patch and then remove it in the next.

>  void
>  ka_destroy(void)
>  {
> -    if (ka_info) {
> -        struct ka_process_info *pinfo;
> -
> -        ovs_mutex_lock(&ka_info->proclist_mutex);
> -        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> -            free(pinfo);
> -        }
> -        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +    if (!ka_info) {
> +        return;
> +    }
>  
> -        hmap_destroy(&ka_info->process_list);
> -        ovs_mutex_destroy(&ka_info->proclist_mutex);
> +    struct ka_process_info *pinfo;
>  
> -        free(ka_info);
> +    ovs_mutex_lock(&ka_info->proclist_mutex);
> +    HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> +        free(pinfo);
>      }
> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
> +
> +    hmap_destroy(&ka_info->process_list);
> +    ovs_mutex_destroy(&ka_info->proclist_mutex);
> +
> +    free(ka_info);
>  }
> diff --git a/lib/keepalive.h b/lib/keepalive.h
> index d133398..607ee3b 100644
> --- a/lib/keepalive.h
> +++ b/lib/keepalive.h
> @@ -38,8 +38,10 @@ enum keepalive_state {
>  typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
>  
>  struct ka_process_info {
> +    char name[16];
>      int tid;
>      int core_id;
> +    bool heartbeats;
>      enum keepalive_state core_state;
>      uint64_t core_last_seen_times;
>      struct hmap_node node;
> @@ -52,6 +54,10 @@ struct keepalive_info {
>      /* List of process/threads monitored by KA framework. */
>      struct hmap process_list OVS_GUARDED;
>  
> +    /* count of threads registered to KA framework. */
> +    uint32_t pmd_cnt;
> +    uint32_t nonpmd_cnt;
> +

What are these counts for?

>      /* Store Datapath threads 'tid'.
>       * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
>      pid_t thread_id[KA_DP_MAXCORES];
> @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum keepalive_state,
>  
>  int ka_get_pmd_tid(unsigned core);
>  bool ka_is_enabled(void);
> +void ka_register_thread(int, bool);
> +void ka_unregister_thread(int, bool);
> +void ka_mark_pmd_thread_alive(unsigned);
> +void ka_mark_pmd_thread_sleep(unsigned);
> +
> +void ka_store_pmd_id(unsigned core);
> +uint32_t get_ka_interval(void);
> +int get_ka_init_status(void);
> +int ka_alloc_portstats(unsigned, int);
> +void ka_destroy_portstats(void);
> +
>  #endif /* keepalive.h */
> diff --git a/lib/util.c b/lib/util.c
> index 36e3731..83f7e0a 100644
> --- a/lib/util.c
> +++ b/lib/util.c
> @@ -26,6 +26,9 @@
>  #include <stdlib.h>
>  #include <string.h>
>  #include <sys/stat.h>
> +#ifdef __linux__
> +#include <sys/syscall.h>
> +#endif
>  #include <unistd.h>
>  #include "bitmap.h"
>  #include "byte-order.h"
> @@ -568,6 +571,16 @@ get_page_size(void)
>      return cached;
>  }
>  
> +int
> +gettid(void)
> +{
> +#ifdef __linux__
> +    return (int)syscall(SYS_gettid);

This returns a pid_t type.  Don't cast it to int, please.

Also, call it ovs_gettid()

> +#endif
> +
> +    return -EINVAL;
> +}
> +
>  /* Returns the time at which the system booted, as the number of milliseconds
>   * since the epoch, or 0 if the time of boot cannot be determined. */
>  long long int
> @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds)
>      ovsrcu_quiesce_end();
>  }
>  
> +void
> +xusleep(unsigned int microseconds)
> +{
> +    ovsrcu_quiesce_start();
> +#ifdef __linux__
> +    usleep(microseconds);

I commented before on this not being put in a generic place since
there's no equivalent.

Further, according to the manpages:

POSIX.1-2001  declares this function obsolete;
use nanosleep(2) instead.  POSIX.1-2008 removes  the  specification  of
usleep().

Please don't expose this through utils - ONLY linux will provide it, and
it's possible at some future point that glibc removes it.

> +#endif
> +    ovsrcu_quiesce_end();
> +}
> +
>  /* Determine whether standard output is a tty or not. This is useful to decide
>   * whether to use color output or not when --color option for utilities is set
>   * to `auto`.
> diff --git a/lib/util.h b/lib/util.h
> index 764e0a0..f3cb61a 100644
> --- a/lib/util.h
> +++ b/lib/util.h
> @@ -120,6 +120,7 @@ const char *get_subprogram_name(void);
>  
>  unsigned int get_page_size(void);
>  long long int get_boot_time(void);
> +int gettid(void);
>  
>  void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
>  
> @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
>  }
>  
>  void xsleep(unsigned int seconds);
> +void xusleep(unsigned int microseconds);
>  
>  bool is_stdout_a_tty(void);
Bodireddy, Bhanuprakash Sept. 7, 2017, 3:36 p.m. UTC | #2
>Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:
>
>> This commit registers the packet processing PMD cores to keepalive
>> framework. Only PMDs that have rxqs mapped will be registered and
>> actively monitored by KA framework.
>>
>> This commit spawns a keepalive thread that will dispatch heartbeats to
>> PMD cores. The pmd threads respond to heartbeats by marking themselves
>> alive. As long as PMD responds to heartbeats it is considered 'healthy'.
>>
>> Signed-off-by: Bhanuprakash Bodireddy
>> <bhanuprakash.bodireddy@intel.com>
>> ---
>
>I'm really confused with this patch.  I've stopped reviewing the series.
>
>It seems like there's a mix of 'track by core id' and 'track by thread id'.
>
>I don't think it's possible to do anything by core id.  We can never know what
>else has been scheduled on those cores, and we cannot be sure that a taskset
>or other scheduler provisioning call will move the threads.

[BHANU] I have already answered this in other thread. 
one can't correlate threads with cores and we shouldn't be tracking by cores. However with PMD threads 
there is 1:1 mapping of PMD and the core-id and it's safe to temporarily write PMD liveness info in to an array indexed
by core id before updating this in to HMAP. 

However as already mentioned, we are using tid for all other purposes as it is unique across the system.

>
>>  lib/dpif-netdev.c |  70 +++++++++++++++++++++++++
>>  lib/keepalive.c   | 153
>++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  lib/keepalive.h   |  17 ++++++
>>  lib/util.c        |  23 ++++++++
>>  lib/util.h        |   2 +
>>  5 files changed, 254 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
>> e2cd931..84c7ffd 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -49,6 +49,7 @@
>>  #include "flow.h"
>>  #include "hmapx.h"
>>  #include "id-pool.h"
>> +#include "keepalive.h"
>>  #include "latch.h"
>>  #include "netdev.h"
>>  #include "netdev-vport.h"
>> @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp,
>>      *n = k;
>>  }
>>
>> +static void *
>> +ovs_keepalive(void *f_ OVS_UNUSED)
>> +{
>> +    pthread_detach(pthread_self());
>> +
>> +    for (;;) {
>> +        xusleep(get_ka_interval() * 1000);
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static void
>> +ka_thread_start(struct dp_netdev *dp) {
>> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
>> +
>> +    if (ovsthread_once_start(&once)) {
>> +        ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
>> +
>> +        ovsthread_once_done(&once);
>> +    }
>> +}
>> +
>> +static void
>> +ka_register_datapath_threads(struct dp_netdev *dp) {
>> +    int ka_init = get_ka_init_status();
>> +    VLOG_DBG("Keepalive: Was initialization successful? [%s]",
>> +                ka_init ? "Success" : "Failure");
>> +    if (!ka_init) {
>> +        return;
>> +    }
>> +
>> +    ka_thread_start(dp);
>> +
>> +    struct dp_netdev_pmd_thread *pmd;
>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> +        /*  Register only PMD threads. */
>> +        if (pmd->core_id != NON_PMD_CORE_ID) {
>> +            int tid = ka_get_pmd_tid(pmd->core_id);
>> +
>> +            /* Skip PMD thread with no rxqs mapping. */
>
>why skip these pmds?  we should still watch them, and then we can
>correlated interesting events (for instance... when an rxq gets added whats
>the change in utilization, etc).

[BHANU]  We shoud skip the PMDs that has no rxqs mapped. This would happen in cases
where there are more PMD threads than the number of rxqs. 

More importantly a PMD thread with no mapped rxq will not even enter the receive loop and
will be in sleep state as below. 

$ ps -eLo tid,psr,comm,state | grep pmd
 51727   3 pmd61           R
 51747   0 pmd62           S
 51749   1 pmd63           S
 51750   2 pmd64           R
 51756   6 pmd65           S
 51758   7 pmd66           R
 51759   4 pmd67           R
 51760   5 pmd68           S

When an rxq gets added to a sleeping PMD thread, the datapath reconfiguration happens,
this time threads get registered to KA framework as below.

CP:  reconfigure_datapath() -> ka_register_datapath_threads() -> ka_register_thread().

>
>> +            if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
>> +                /* rxq mapping changes due to reconfiguration,
>> +                 * if there are no rxqs mapped to PMD, unregister it. */
>> +                ka_unregister_thread(tid, true);
>> +                continue;
>> +            }
>> +
>> +            ka_register_thread(tid, true);
>> +            VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA
>framework",
>> +                      tid, pmd->core_id);
>> +        }
>> +    }
>> +}
>> +
>>  static void
>>  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char
>*argv[],
>>                       void *aux)
>> @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>>
>>      /* Reload affected pmd threads. */
>>      reload_affected_pmds(dp);
>> +
>> +    /* Register datapath threads to KA monitoring. */
>> +    ka_register_datapath_threads(dp);
>>  }
>>
>>  /* Returns true if one of the netdevs in 'dp' requires a
>> reconfiguration */ @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_)
>>
>>      poll_list = NULL;
>>
>> +    ka_store_pmd_id(pmd->core_id);
>> +
>>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>>      ovs_numa_thread_setaffinity_core(pmd->core_id);
>> @@ -3859,6 +3922,9 @@ reload:
>>                                                        : PMD_CYCLES_IDLE);
>>          }
>>
>> +        /* Mark PMD thread alive. */
>> +        ka_mark_pmd_thread_alive(pmd->core_id);
>> +
>>          if (lc++ > 1024) {
>>              bool reload;
>>
>> @@ -3892,6 +3958,10 @@ reload:
>>      }
>>
>>      emc_cache_uninit(&pmd->flow_cache);
>> +
>> +    int tid = ka_get_pmd_tid(pmd->core_id);
>> +    ka_unregister_thread(tid, true);
>> +
>>      free(poll_list);
>>      pmd_free_cached_ports(pmd);
>>      return NULL;
>> diff --git a/lib/keepalive.c b/lib/keepalive.c index ac73a42..dfafaeb
>> 100644
>> --- a/lib/keepalive.c
>> +++ b/lib/keepalive.c
>> @@ -23,6 +23,7 @@
>>  #include "keepalive.h"
>>  #include "lib/vswitch-idl.h"
>>  #include "openvswitch/vlog.h"
>> +#include "process.h"
>>  #include "timeval.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(keepalive);
>> @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx)
>>      return -EINVAL;
>>  }
>>
>> +/* Return the Keepalive timer interval. */ inline uint32_t
>
>Same as before w.r.t. inline in c files.

[BHANU]  OK.

>
>> +get_ka_interval(void)
>> +{
>> +    return keepalive_timer_interval;
>> +}
>> +
>> +int
>> +get_ka_init_status(void)
>> +{
>> +    return ka_init_status;
>> +}
>> +
>> +void
>> +ka_store_pmd_id(unsigned core_idx)
>> +{
>> +    int tid = gettid();
>> +
>> +    if (ka_is_enabled()) {
>> +        ovs_assert(tid > 0);
>
>If someone changes pid_max to be a larger number than a 32-bit integer,
>won't this assert trigger?

[BHANU]  With my search skills, I see that PID_MAX_LIMIX can be set to 4 million, so this shouldn't happen.

-----------------------"include/linux/threads.h"--------------------------------------
#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)

/*
 * A maximum of 4 million PIDs should be enough for a while.
 * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
 */
#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
        (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
----------------------------------------------------------------------------------------------

Man page also has some reference to PIX_MAX_LIMIT and confirms the above finding:
http://man7.org/linux/man-pages/man5/proc.5.html


>
>> +        ka_info->thread_id[core_idx] = tid;
>> +    }
>> +}
>> +
>> +/* Register thread to KA framework. */ void ka_register_thread(int
>> +tid, bool thread_is_pmd) {
>> +    if (ka_is_enabled()) {
>> +        struct ka_process_info *ka_pinfo;
>> +        int core_num = -1;
>> +        char proc_name[18] = "UNDEFINED";
>> +
>> +        struct process_info pinfo;
>> +        int success = get_process_info(tid, &pinfo);
>> +        if (success) {
>> +            core_num = pinfo.core_id;
>> +            ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
>> +        }
>> +
>> +        ovs_assert(core_num >= 0);
>> +
>> +        uint32_t hash = hash_int(tid, 0);
>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
>> +                                 hash, &ka_info->process_list) {
>> +            /* PMD thread is already registered. */
>> +            if (ka_pinfo->tid == tid) {
>> +                goto out;
>> +            }
>> +        }
>> +
>> +        ka_pinfo = xmalloc(sizeof *ka_pinfo);
>> +        ka_pinfo->tid = tid;
>> +        ka_pinfo->heartbeats = true;
>> +        ka_pinfo->core_id = core_num;
>> +        ovs_strlcpy(ka_pinfo->name, proc_name, sizeof
>> + ka_pinfo->name);
>> +
>> +        hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
>> +
>> +        if (thread_is_pmd) {
>
>Why treat these differently?  Get rid of the thread_is_pmd.

[BHANU]
  The API is named as ka_register_thread(), meaning it can register any thread be it PMD or non-PMD.
   In case of PMD thread, thread_is_pmd is set, and uses core tracking and increments pmd_cnt for bookkeeping. 

   In the future, if we implement registering of non-pmd threads this API can be called with thread_is_pmd == false.

>
>> +            ka_info->active_cores[core_num] = KA_STATE_ALIVE;
>> +            ka_info->last_alive[core_num] = time_wall_msec();
>
>As noted in the previous patch, there is no way to properly infer core usage
>from this, so get rid of all the "core based" tracking.
>
>> +            ka_info->pmd_cnt++;  /* Increment PMD count. */
>> +        } else {
>> +            ka_info->nonpmd_cnt++;  /* Increment non-pmd thread count. */
>> +        }
>> +out:
>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +    }
>> +}
>> +
>> +/* Unregister thread from KA framework. */ void
>> +ka_unregister_thread(int tid, bool thread_is_pmd) {
>> +    if (ka_is_enabled()) {
>> +        struct ka_process_info *ka_pinfo;
>> +
>> +        int core_num = -1;
>> +        struct process_info pinfo;
>> +        if (get_process_info(tid, &pinfo)) {
>> +            core_num = pinfo.core_id;
>> +        }
>> +
>> +        ovs_assert(core_num >= 0);
>> +
>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
>> +                                 &ka_info->process_list) {
>> +            /* If PMD thread is registered, remove it from the list */
>> +            if (ka_pinfo->tid == tid) {
>> +                hmap_remove(&ka_info->process_list, &ka_pinfo->node);
>> +                free(ka_pinfo);
>> +
>> +                if (thread_is_pmd) {
>> +                    ka_info->active_cores[core_num] = KA_STATE_UNUSED;
>> +                    ka_info->pmd_cnt--;  /* Decrement PMD count. */
>> +                } else {
>> +                    ka_info->nonpmd_cnt--;  /* Decrement non-pmd thread cnt. */
>> +                }
>> +
>> +                break;
>> +            }
>> +        }
>> +
>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +    }
>> +}
>> +
>> +/* Mark packet processing core alive. */ void
>> +ka_mark_pmd_thread_alive(unsigned core_id) {
>> +    if (ka_is_enabled()) {
>> +        ka_info->state_flags[core_id] = KA_STATE_ALIVE;
>
>shouldn't this be done by tid not coreid?

[BHANU]  The API is used by PMD threads and PMD thread pinned to *core_id* is marking itself ALIVE in packet processing loop.

>
>> +    }
>> +}
>> +
>> +/* Mark packet processing core as idle. */ inline void
>
>And again.
>
>> +ka_mark_pmd_thread_sleep(unsigned core_id) {
>> +    if (ka_is_enabled()) {
>> +        ka_info->state_flags[core_id] = KA_STATE_DOZING;
>> +    }
>> +}
>> +
>>  void
>>  ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
>>                      uint64_t last_alive) @@ -166,18 +295,20 @@
>> ka_init(const struct smap *ovs_other_config)
>
>This whole diff section is inappropriate.  Please don't add code in one patch
>and then remove it in the next.

[BHANU]  Its bit of realignment. I will fix it anyways.

>
>>  void
>>  ka_destroy(void)
>>  {
>> -    if (ka_info) {
>> -        struct ka_process_info *pinfo;
>> -
>> -        ovs_mutex_lock(&ka_info->proclist_mutex);
>> -        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>> -            free(pinfo);
>> -        }
>> -        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +    if (!ka_info) {
>> +        return;
>> +    }
>>
>> -        hmap_destroy(&ka_info->process_list);
>> -        ovs_mutex_destroy(&ka_info->proclist_mutex);
>> +    struct ka_process_info *pinfo;
>>
>> -        free(ka_info);
>> +    ovs_mutex_lock(&ka_info->proclist_mutex);
>> +    HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>> +        free(pinfo);
>>      }
>> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +
>> +    hmap_destroy(&ka_info->process_list);
>> +    ovs_mutex_destroy(&ka_info->proclist_mutex);
>> +
>> +    free(ka_info);
>>  }
>> diff --git a/lib/keepalive.h b/lib/keepalive.h index d133398..607ee3b
>> 100644
>> --- a/lib/keepalive.h
>> +++ b/lib/keepalive.h
>> @@ -38,8 +38,10 @@ enum keepalive_state {  typedef void
>> (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
>>
>>  struct ka_process_info {
>> +    char name[16];
>>      int tid;
>>      int core_id;
>> +    bool heartbeats;
>>      enum keepalive_state core_state;
>>      uint64_t core_last_seen_times;
>>      struct hmap_node node;
>> @@ -52,6 +54,10 @@ struct keepalive_info {
>>      /* List of process/threads monitored by KA framework. */
>>      struct hmap process_list OVS_GUARDED;
>>
>> +    /* count of threads registered to KA framework. */
>> +    uint32_t pmd_cnt;
>> +    uint32_t nonpmd_cnt;
>> +
>
>What are these counts for?

[BHANU]  This is to track the number of PMD threads and non PMD threads on compute and displayed with 'Keepalive/pmd-health-show'

>
>>      /* Store Datapath threads 'tid'.
>>       * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
>>      pid_t thread_id[KA_DP_MAXCORES];
>> @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum
>> keepalive_state,
>>
>>  int ka_get_pmd_tid(unsigned core);
>>  bool ka_is_enabled(void);
>> +void ka_register_thread(int, bool);
>> +void ka_unregister_thread(int, bool); void
>> +ka_mark_pmd_thread_alive(unsigned);
>> +void ka_mark_pmd_thread_sleep(unsigned);
>> +
>> +void ka_store_pmd_id(unsigned core);
>> +uint32_t get_ka_interval(void);
>> +int get_ka_init_status(void);
>> +int ka_alloc_portstats(unsigned, int); void
>> +ka_destroy_portstats(void);
>> +
>>  #endif /* keepalive.h */
>> diff --git a/lib/util.c b/lib/util.c
>> index 36e3731..83f7e0a 100644
>> --- a/lib/util.c
>> +++ b/lib/util.c
>> @@ -26,6 +26,9 @@
>>  #include <stdlib.h>
>>  #include <string.h>
>>  #include <sys/stat.h>
>> +#ifdef __linux__
>> +#include <sys/syscall.h>
>> +#endif
>>  #include <unistd.h>
>>  #include "bitmap.h"
>>  #include "byte-order.h"
>> @@ -568,6 +571,16 @@ get_page_size(void)
>>      return cached;
>>  }
>>
>> +int
>> +gettid(void)
>> +{
>> +#ifdef __linux__
>> +    return (int)syscall(SYS_gettid);
>
>This returns a pid_t type.  Don't cast it to int, please.
>
>Also, call it ovs_gettid()

[BHANU] OK.

>
>> +#endif
>> +
>> +    return -EINVAL;
>> +}
>> +
>>  /* Returns the time at which the system booted, as the number of
>milliseconds
>>   * since the epoch, or 0 if the time of boot cannot be determined. */
>> long long int @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds)
>>      ovsrcu_quiesce_end();
>>  }
>>
>> +void
>> +xusleep(unsigned int microseconds)
>> +{
>> +    ovsrcu_quiesce_start();
>> +#ifdef __linux__
>> +    usleep(microseconds);
>
>I commented before on this not being put in a generic place since there's no
>equivalent.
>
>Further, according to the manpages:
>
>POSIX.1-2001  declares this function obsolete; use nanosleep(2) instead.
>POSIX.1-2008 removes  the  specification  of usleep().
>
>Please don't expose this through utils - ONLY linux will provide it, and it's
>possible at some future point that glibc removes it.
>
[BHANU] Can you suggest suitable place for this API?


>> +#endif
>> +    ovsrcu_quiesce_end();
>> +}
>> +
>>  /* Determine whether standard output is a tty or not. This is useful to
>decide
>>   * whether to use color output or not when --color option for utilities is set
>>   * to `auto`.
>> diff --git a/lib/util.h b/lib/util.h
>> index 764e0a0..f3cb61a 100644
>> --- a/lib/util.h
>> +++ b/lib/util.h
>> @@ -120,6 +120,7 @@ const char *get_subprogram_name(void);
>>
>>  unsigned int get_page_size(void);
>>  long long int get_boot_time(void);
>> +int gettid(void);
>>
>>  void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
>>
>> @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
>> }
>>
>>  void xsleep(unsigned int seconds);
>> +void xusleep(unsigned int microseconds);
>>
>>  bool is_stdout_a_tty(void);
Aaron Conole Sept. 8, 2017, 4:53 p.m. UTC | #3
"Bodireddy, Bhanuprakash" <bhanuprakash.bodireddy@intel.com> writes:

>>Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:
>>
>>> This commit registers the packet processing PMD cores to keepalive
>>> framework. Only PMDs that have rxqs mapped will be registered and
>>> actively monitored by KA framework.
>>>
>>> This commit spawns a keepalive thread that will dispatch heartbeats to
>>> PMD cores. The pmd threads respond to heartbeats by marking themselves
>>> alive. As long as PMD responds to heartbeats it is considered 'healthy'.
>>>
>>> Signed-off-by: Bhanuprakash Bodireddy
>>> <bhanuprakash.bodireddy@intel.com>
>>> ---
>>
>>I'm really confused with this patch.  I've stopped reviewing the series.
>>
>>It seems like there's a mix of 'track by core id' and 'track by thread id'.
>>
>>I don't think it's possible to do anything by core id.  We can never know what
>>else has been scheduled on those cores, and we cannot be sure that a taskset
>>or other scheduler provisioning call will move the threads.
>
> [BHANU] I have already answered this in other thread. 
> one can't correlate threads with cores and we shouldn't be tracking by
> cores. However with PMD threads
> there is 1:1 mapping of PMD and the core-id and it's safe to
> temporarily write PMD liveness info in to an array indexed
> by core id before updating this in to HMAP. 

The core-id as a concept here is deceptive.  An external entity (such as
taskset) can rebalance the PMDs.  External entities can be scheduled on
the cores.  I think it's dangerous to have anything called core-id
in this series or feature, because people will naturally infer things
which aren't true.  Additionally, it will lead to things like "well, we
know that core x,y,z are running at A%, so we can schedule things
thusly..."

Makes sense?

> However as already mentioned, we are using tid for all other purposes
> as it is unique across the system.
>
>>
>>>  lib/dpif-netdev.c |  70 +++++++++++++++++++++++++
>>>  lib/keepalive.c   | 153
>>++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>  lib/keepalive.h   |  17 ++++++
>>>  lib/util.c        |  23 ++++++++
>>>  lib/util.h        |   2 +
>>>  5 files changed, 254 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
>>> e2cd931..84c7ffd 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -49,6 +49,7 @@
>>>  #include "flow.h"
>>>  #include "hmapx.h"
>>>  #include "id-pool.h"
>>> +#include "keepalive.h"
>>>  #include "latch.h"
>>>  #include "netdev.h"
>>>  #include "netdev-vport.h"
>>> @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp,
>>>      *n = k;
>>>  }
>>>
>>> +static void *
>>> +ovs_keepalive(void *f_ OVS_UNUSED)
>>> +{
>>> +    pthread_detach(pthread_self());
>>> +
>>> +    for (;;) {
>>> +        xusleep(get_ka_interval() * 1000);
>>> +    }
>>> +
>>> +    return NULL;
>>> +}
>>> +
>>> +static void
>>> +ka_thread_start(struct dp_netdev *dp) {
>>> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
>>> +
>>> +    if (ovsthread_once_start(&once)) {
>>> +        ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
>>> +
>>> +        ovsthread_once_done(&once);
>>> +    }
>>> +}
>>> +
>>> +static void
>>> +ka_register_datapath_threads(struct dp_netdev *dp) {
>>> +    int ka_init = get_ka_init_status();
>>> +    VLOG_DBG("Keepalive: Was initialization successful? [%s]",
>>> +                ka_init ? "Success" : "Failure");
>>> +    if (!ka_init) {
>>> +        return;
>>> +    }
>>> +
>>> +    ka_thread_start(dp);
>>> +
>>> +    struct dp_netdev_pmd_thread *pmd;
>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> +        /*  Register only PMD threads. */
>>> +        if (pmd->core_id != NON_PMD_CORE_ID) {
>>> +            int tid = ka_get_pmd_tid(pmd->core_id);
>>> +
>>> +            /* Skip PMD thread with no rxqs mapping. */
>>
>>why skip these pmds?  we should still watch them, and then we can
>>correlated interesting events (for instance... when an rxq gets added whats
>>the change in utilization, etc).
>
> [BHANU]  We shoud skip the PMDs that has no rxqs mapped. This would happen in cases
> where there are more PMD threads than the number of rxqs. 
>
> More importantly a PMD thread with no mapped rxq will not even enter
> the receive loop and
> will be in sleep state as below. 

Sure - that's okay.  It's consistent.

> $ ps -eLo tid,psr,comm,state | grep pmd
>  51727   3 pmd61           R
>  51747   0 pmd62           S
>  51749   1 pmd63           S
>  51750   2 pmd64           R
>  51756   6 pmd65           S
>  51758   7 pmd66           R
>  51759   4 pmd67           R
>  51760   5 pmd68           S
>
> When an rxq gets added to a sleeping PMD thread, the datapath reconfiguration happens,
> this time threads get registered to KA framework as below.
>
> CP:  reconfigure_datapath() -> ka_register_datapath_threads() -> ka_register_thread().
>
>>
>>> +            if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
>>> +                /* rxq mapping changes due to reconfiguration,
>>> +                 * if there are no rxqs mapped to PMD, unregister it. */
>>> +                ka_unregister_thread(tid, true);
>>> +                continue;
>>> +            }
>>> +
>>> +            ka_register_thread(tid, true);
>>> +            VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA
>>framework",
>>> +                      tid, pmd->core_id);
>>> +        }
>>> +    }
>>> +}
>>> +
>>>  static void
>>>  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char
>>*argv[],
>>>                       void *aux)
>>> @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>>>
>>>      /* Reload affected pmd threads. */
>>>      reload_affected_pmds(dp);
>>> +
>>> +    /* Register datapath threads to KA monitoring. */
>>> +    ka_register_datapath_threads(dp);
>>>  }
>>>
>>>  /* Returns true if one of the netdevs in 'dp' requires a
>>> reconfiguration */ @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_)
>>>
>>>      poll_list = NULL;
>>>
>>> +    ka_store_pmd_id(pmd->core_id);
>>> +
>>>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>>>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>>>      ovs_numa_thread_setaffinity_core(pmd->core_id);
>>> @@ -3859,6 +3922,9 @@ reload:
>>>                                                        : PMD_CYCLES_IDLE);
>>>          }
>>>
>>> +        /* Mark PMD thread alive. */
>>> +        ka_mark_pmd_thread_alive(pmd->core_id);
>>> +
>>>          if (lc++ > 1024) {
>>>              bool reload;
>>>
>>> @@ -3892,6 +3958,10 @@ reload:
>>>      }
>>>
>>>      emc_cache_uninit(&pmd->flow_cache);
>>> +
>>> +    int tid = ka_get_pmd_tid(pmd->core_id);
>>> +    ka_unregister_thread(tid, true);
>>> +
>>>      free(poll_list);
>>>      pmd_free_cached_ports(pmd);
>>>      return NULL;
>>> diff --git a/lib/keepalive.c b/lib/keepalive.c index ac73a42..dfafaeb
>>> 100644
>>> --- a/lib/keepalive.c
>>> +++ b/lib/keepalive.c
>>> @@ -23,6 +23,7 @@
>>>  #include "keepalive.h"
>>>  #include "lib/vswitch-idl.h"
>>>  #include "openvswitch/vlog.h"
>>> +#include "process.h"
>>>  #include "timeval.h"
>>>
>>>  VLOG_DEFINE_THIS_MODULE(keepalive);
>>> @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx)
>>>      return -EINVAL;
>>>  }
>>>
>>> +/* Return the Keepalive timer interval. */ inline uint32_t
>>
>>Same as before w.r.t. inline in c files.
>
> [BHANU]  OK.
>
>>
>>> +get_ka_interval(void)
>>> +{
>>> +    return keepalive_timer_interval;
>>> +}
>>> +
>>> +int
>>> +get_ka_init_status(void)
>>> +{
>>> +    return ka_init_status;
>>> +}
>>> +
>>> +void
>>> +ka_store_pmd_id(unsigned core_idx)
>>> +{
>>> +    int tid = gettid();
>>> +
>>> +    if (ka_is_enabled()) {
>>> +        ovs_assert(tid > 0);
>>
>>If someone changes pid_max to be a larger number than a 32-bit integer,
>>won't this assert trigger?
>
> [BHANU]  With my search skills, I see that PID_MAX_LIMIX can be set to 4 million, so this shouldn't happen.
>
> -----------------------"include/linux/threads.h"--------------------------------------
> #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
>
> /*
>  * A maximum of 4 million PIDs should be enough for a while.
>  * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
>  */
> #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
>         (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
> ----------------------------------------------------------------------------------------------
>
> Man page also has some reference to PIX_MAX_LIMIT and confirms the above finding:
> http://man7.org/linux/man-pages/man5/proc.5.html

Cool; I wasn't aware that it was bounded to a max.

>>
>>> +        ka_info->thread_id[core_idx] = tid;
>>> +    }
>>> +}
>>> +
>>> +/* Register thread to KA framework. */ void ka_register_thread(int
>>> +tid, bool thread_is_pmd) {
>>> +    if (ka_is_enabled()) {
>>> +        struct ka_process_info *ka_pinfo;
>>> +        int core_num = -1;
>>> +        char proc_name[18] = "UNDEFINED";
>>> +
>>> +        struct process_info pinfo;
>>> +        int success = get_process_info(tid, &pinfo);
>>> +        if (success) {
>>> +            core_num = pinfo.core_id;
>>> +            ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
>>> +        }
>>> +
>>> +        ovs_assert(core_num >= 0);
>>> +
>>> +        uint32_t hash = hash_int(tid, 0);
>>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>>> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
>>> +                                 hash, &ka_info->process_list) {
>>> +            /* PMD thread is already registered. */
>>> +            if (ka_pinfo->tid == tid) {
>>> +                goto out;
>>> +            }
>>> +        }
>>> +
>>> +        ka_pinfo = xmalloc(sizeof *ka_pinfo);
>>> +        ka_pinfo->tid = tid;
>>> +        ka_pinfo->heartbeats = true;
>>> +        ka_pinfo->core_id = core_num;
>>> +        ovs_strlcpy(ka_pinfo->name, proc_name, sizeof
>>> + ka_pinfo->name);
>>> +
>>> +        hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
>>> +
>>> +        if (thread_is_pmd) {
>>
>>Why treat these differently?  Get rid of the thread_is_pmd.
>
> [BHANU]
>   The API is named as ka_register_thread(), meaning it can register any thread be it PMD or non-PMD.
>    In case of PMD thread, thread_is_pmd is set, and uses core tracking and increments pmd_cnt for bookkeeping. 
>
>    In the future, if we implement registering of non-pmd threads this API can be called with thread_is_pmd == false.

I'm saying that distinction isn't useful for the purposes of
status reports.  We can tell a thread is a PMD by it's name if we really
care.

>>
>>> +            ka_info->active_cores[core_num] = KA_STATE_ALIVE;
>>> +            ka_info->last_alive[core_num] = time_wall_msec();
>>
>>As noted in the previous patch, there is no way to properly infer core usage
>>from this, so get rid of all the "core based" tracking.
>>
>>> +            ka_info->pmd_cnt++;  /* Increment PMD count. */
>>> +        } else {
>>> +            ka_info->nonpmd_cnt++;  /* Increment non-pmd thread count. */
>>> +        }
>>> +out:
>>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>>> +    }
>>> +}
>>> +
>>> +/* Unregister thread from KA framework. */ void
>>> +ka_unregister_thread(int tid, bool thread_is_pmd) {
>>> +    if (ka_is_enabled()) {
>>> +        struct ka_process_info *ka_pinfo;
>>> +
>>> +        int core_num = -1;
>>> +        struct process_info pinfo;
>>> +        if (get_process_info(tid, &pinfo)) {
>>> +            core_num = pinfo.core_id;
>>> +        }
>>> +
>>> +        ovs_assert(core_num >= 0);
>>> +
>>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>>> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
>>> +                                 &ka_info->process_list) {
>>> +            /* If PMD thread is registered, remove it from the list */
>>> +            if (ka_pinfo->tid == tid) {
>>> +                hmap_remove(&ka_info->process_list, &ka_pinfo->node);
>>> +                free(ka_pinfo);
>>> +
>>> +                if (thread_is_pmd) {
>>> +                    ka_info->active_cores[core_num] = KA_STATE_UNUSED;
>>> +                    ka_info->pmd_cnt--;  /* Decrement PMD count. */
>>> +                } else {
>>> +                    ka_info->nonpmd_cnt--;  /* Decrement non-pmd thread cnt. */
>>> +                }
>>> +
>>> +                break;
>>> +            }
>>> +        }
>>> +
>>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>>> +    }
>>> +}
>>> +
>>> +/* Mark packet processing core alive. */ void
>>> +ka_mark_pmd_thread_alive(unsigned core_id) {
>>> +    if (ka_is_enabled()) {
>>> +        ka_info->state_flags[core_id] = KA_STATE_ALIVE;
>>
>>shouldn't this be done by tid not coreid?
>
> [BHANU] The API is used by PMD threads and PMD thread pinned to
> *core_id* is marking itself ALIVE in packet processing loop.

See previous about coreid.  Since it's never re-checked, and an external
entity can make the actual value change, it's not useful.

>>
>>> +    }
>>> +}
>>> +
>>> +/* Mark packet processing core as idle. */ inline void
>>
>>And again.
>>
>>> +ka_mark_pmd_thread_sleep(unsigned core_id) {
>>> +    if (ka_is_enabled()) {
>>> +        ka_info->state_flags[core_id] = KA_STATE_DOZING;
>>> +    }
>>> +}
>>> +
>>>  void
>>>  ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
>>>                      uint64_t last_alive) @@ -166,18 +295,20 @@
>>> ka_init(const struct smap *ovs_other_config)
>>
>>This whole diff section is inappropriate.  Please don't add code in one patch
>>and then remove it in the next.
>
> [BHANU]  Its bit of realignment. I will fix it anyways.
>
>>
>>>  void
>>>  ka_destroy(void)
>>>  {
>>> -    if (ka_info) {
>>> -        struct ka_process_info *pinfo;
>>> -
>>> -        ovs_mutex_lock(&ka_info->proclist_mutex);
>>> -        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>>> -            free(pinfo);
>>> -        }
>>> -        ovs_mutex_unlock(&ka_info->proclist_mutex);
>>> +    if (!ka_info) {
>>> +        return;
>>> +    }
>>>
>>> -        hmap_destroy(&ka_info->process_list);
>>> -        ovs_mutex_destroy(&ka_info->proclist_mutex);
>>> +    struct ka_process_info *pinfo;
>>>
>>> -        free(ka_info);
>>> +    ovs_mutex_lock(&ka_info->proclist_mutex);
>>> +    HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>>> +        free(pinfo);
>>>      }
>>> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
>>> +
>>> +    hmap_destroy(&ka_info->process_list);
>>> +    ovs_mutex_destroy(&ka_info->proclist_mutex);
>>> +
>>> +    free(ka_info);
>>>  }
>>> diff --git a/lib/keepalive.h b/lib/keepalive.h index d133398..607ee3b
>>> 100644
>>> --- a/lib/keepalive.h
>>> +++ b/lib/keepalive.h
>>> @@ -38,8 +38,10 @@ enum keepalive_state {  typedef void
>>> (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
>>>
>>>  struct ka_process_info {
>>> +    char name[16];
>>>      int tid;
>>>      int core_id;
>>> +    bool heartbeats;
>>>      enum keepalive_state core_state;
>>>      uint64_t core_last_seen_times;
>>>      struct hmap_node node;
>>> @@ -52,6 +54,10 @@ struct keepalive_info {
>>>      /* List of process/threads monitored by KA framework. */
>>>      struct hmap process_list OVS_GUARDED;
>>>
>>> +    /* count of threads registered to KA framework. */
>>> +    uint32_t pmd_cnt;
>>> +    uint32_t nonpmd_cnt;
>>> +
>>
>>What are these counts for?
>
> [BHANU] This is to track the number of PMD threads and non PMD threads
> on compute and displayed with 'Keepalive/pmd-health-show'

Why do we need that?  Why not just status-show and have it dump
everything?

>>
>>>      /* Store Datapath threads 'tid'.
>>>       * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
>>>      pid_t thread_id[KA_DP_MAXCORES];
>>> @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum
>>> keepalive_state,
>>>
>>>  int ka_get_pmd_tid(unsigned core);
>>>  bool ka_is_enabled(void);
>>> +void ka_register_thread(int, bool);
>>> +void ka_unregister_thread(int, bool); void
>>> +ka_mark_pmd_thread_alive(unsigned);
>>> +void ka_mark_pmd_thread_sleep(unsigned);
>>> +
>>> +void ka_store_pmd_id(unsigned core);
>>> +uint32_t get_ka_interval(void);
>>> +int get_ka_init_status(void);
>>> +int ka_alloc_portstats(unsigned, int); void
>>> +ka_destroy_portstats(void);
>>> +
>>>  #endif /* keepalive.h */
>>> diff --git a/lib/util.c b/lib/util.c
>>> index 36e3731..83f7e0a 100644
>>> --- a/lib/util.c
>>> +++ b/lib/util.c
>>> @@ -26,6 +26,9 @@
>>>  #include <stdlib.h>
>>>  #include <string.h>
>>>  #include <sys/stat.h>
>>> +#ifdef __linux__
>>> +#include <sys/syscall.h>
>>> +#endif
>>>  #include <unistd.h>
>>>  #include "bitmap.h"
>>>  #include "byte-order.h"
>>> @@ -568,6 +571,16 @@ get_page_size(void)
>>>      return cached;
>>>  }
>>>
>>> +int
>>> +gettid(void)
>>> +{
>>> +#ifdef __linux__
>>> +    return (int)syscall(SYS_gettid);
>>
>>This returns a pid_t type.  Don't cast it to int, please.
>>
>>Also, call it ovs_gettid()
>
> [BHANU] OK.
>
>>
>>> +#endif
>>> +
>>> +    return -EINVAL;
>>> +}
>>> +
>>>  /* Returns the time at which the system booted, as the number of
>>milliseconds
>>>   * since the epoch, or 0 if the time of boot cannot be determined. */
>>> long long int @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds)
>>>      ovsrcu_quiesce_end();
>>>  }
>>>
>>> +void
>>> +xusleep(unsigned int microseconds)
>>> +{
>>> +    ovsrcu_quiesce_start();
>>> +#ifdef __linux__
>>> +    usleep(microseconds);
>>
>>I commented before on this not being put in a generic place since there's no
>>equivalent.
>>
>>Further, according to the manpages:
>>
>>POSIX.1-2001  declares this function obsolete; use nanosleep(2) instead.
>>POSIX.1-2008 removes  the  specification  of usleep().
>>
>>Please don't expose this through utils - ONLY linux will provide it, and it's
>>possible at some future point that glibc removes it.
>>
> [BHANU] Can you suggest suitable place for this API?

If it changes to using a posix nanosleep(), then it may be appropriate in the
ovs utils, but only when WIN32 is not defined.  As it stands using
usleep, it should probably be specific to the keepalive.c file - that
way other modules won't assume such functionality exists.

I recommend looking at the manpages for nanosleep(), and
clock_nanosleep() - paying attention to the interaction between various
clock sources and things like ntpd or administrative changes of the
realtime clock if you use sleeping functions.

>
>>> +#endif
>>> +    ovsrcu_quiesce_end();
>>> +}
>>> +
>>>  /* Determine whether standard output is a tty or not. This is useful to
>>decide
>>>   * whether to use color output or not when --color option for utilities is set
>>>   * to `auto`.
>>> diff --git a/lib/util.h b/lib/util.h
>>> index 764e0a0..f3cb61a 100644
>>> --- a/lib/util.h
>>> +++ b/lib/util.h
>>> @@ -120,6 +120,7 @@ const char *get_subprogram_name(void);
>>>
>>>  unsigned int get_page_size(void);
>>>  long long int get_boot_time(void);
>>> +int gettid(void);
>>>
>>>  void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
>>>
>>> @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
>>> }
>>>
>>>  void xsleep(unsigned int seconds);
>>> +void xusleep(unsigned int microseconds);
>>>
>>>  bool is_stdout_a_tty(void);
Bodireddy, Bhanuprakash Sept. 13, 2017, 4:13 p.m. UTC | #4
>"Bodireddy, Bhanuprakash" <bhanuprakash.bodireddy@intel.com> writes:
>
>>>Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:
>>>
>>>> This commit registers the packet processing PMD cores to keepalive
>>>> framework. Only PMDs that have rxqs mapped will be registered and
>>>> actively monitored by KA framework.
>>>>
>>>> This commit spawns a keepalive thread that will dispatch heartbeats
>>>> to PMD cores. The pmd threads respond to heartbeats by marking
>>>> themselves alive. As long as PMD responds to heartbeats it is considered
>'healthy'.
>>>>
>>>> Signed-off-by: Bhanuprakash Bodireddy
>>>> <bhanuprakash.bodireddy@intel.com>
>>>> ---
>>>
>>>I'm really confused with this patch.  I've stopped reviewing the series.
>>>
>>>It seems like there's a mix of 'track by core id' and 'track by thread id'.
>>>
>>>I don't think it's possible to do anything by core id.  We can never
>>>know what else has been scheduled on those cores, and we cannot be
>>>sure that a taskset or other scheduler provisioning call will move the
>threads.
>>
>> [BHANU] I have already answered this in other thread.
>> one can't correlate threads with cores and we shouldn't be tracking by
>> cores. However with PMD threads there is 1:1 mapping of PMD and the
>> core-id and it's safe to temporarily write PMD liveness info in to an
>> array indexed by core id before updating this in to HMAP.
>
>The core-id as a concept here is deceptive.  An external entity (such as
>taskset) can rebalance the PMDs.  External entities can be scheduled on the
>cores.  I think it's dangerous to have anything called core-id in this series or
>feature, because people will naturally infer things which aren't true.
>Additionally, it will lead to things like "well, we know that core x,y,z are
>running at A%, so we can schedule things thusly..."
>
>Makes sense?
>

The concerns above makes sense and you have a valid point. 
Unfortunately the logic that you see w.r.t PMD, core_id mapping is something that was
implemented in rte_keepalive library and I inherited it. As the 1:1 mapping of a thread(PMD)
to core is deceptive and makes little sense, I reworked on a different approach with no impact
on datapath performance. I was testing this last few days to check for perf impacts and other
possible issues.

Previous design:
--------------------
As part of heartbeat mechanism (dispath_heartbeats()),  in keepalive_info structure
we had arrays indexed by core-ids used by PMDs and Keepalive thread for heart-beating.
The arrays were used to keep the logic simple and lock-free.

Also Keepalive thread was updating the status periodically in to 'process_list' map using callback function.

New design:
-------------------
we already have a 'process_list' map where all the PMD threads are added by main(vswitchd)
thread. In this new approach, I take a copy of the 'process_list', let's call it 'cached_process_list'
and use this cached map for heartbeating between Keepalive and PMD threads. No locks are 
needed on the 'cached_process_list' there by not impacting the datapath performance.

Also whenever there is datapath reconfiguration(triggered by pmd-cpu-mask), the 'process_list' map 
will be updated and also the cached_process_list will be reloaded from the main map there by having
the maps in sync.  This is handled as part of ka_register_datapath_threads().  I have been testing this 
and seems to be working fine.

This way we can completely avoid all references to core_id in this series. Let me know if you have
any comments on this new approach.

Regards,
Bhanuprakash.
Aaron Conole Sept. 13, 2017, 4:56 p.m. UTC | #5
"Bodireddy, Bhanuprakash" <bhanuprakash.bodireddy@intel.com> writes:

>>"Bodireddy, Bhanuprakash" <bhanuprakash.bodireddy@intel.com> writes:
>>
>>>>Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:
>>>>
>>>>> This commit registers the packet processing PMD cores to keepalive
>>>>> framework. Only PMDs that have rxqs mapped will be registered and
>>>>> actively monitored by KA framework.
>>>>>
>>>>> This commit spawns a keepalive thread that will dispatch heartbeats
>>>>> to PMD cores. The pmd threads respond to heartbeats by marking
>>>>> themselves alive. As long as PMD responds to heartbeats it is considered
>>'healthy'.
>>>>>
>>>>> Signed-off-by: Bhanuprakash Bodireddy
>>>>> <bhanuprakash.bodireddy@intel.com>
>>>>> ---
>>>>
>>>>I'm really confused with this patch.  I've stopped reviewing the series.
>>>>
>>>>It seems like there's a mix of 'track by core id' and 'track by thread id'.
>>>>
>>>>I don't think it's possible to do anything by core id.  We can never
>>>>know what else has been scheduled on those cores, and we cannot be
>>>>sure that a taskset or other scheduler provisioning call will move the
>>threads.
>>>
>>> [BHANU] I have already answered this in other thread.
>>> one can't correlate threads with cores and we shouldn't be tracking by
>>> cores. However with PMD threads there is 1:1 mapping of PMD and the
>>> core-id and it's safe to temporarily write PMD liveness info in to an
>>> array indexed by core id before updating this in to HMAP.
>>
>>The core-id as a concept here is deceptive.  An external entity (such as
>>taskset) can rebalance the PMDs.  External entities can be scheduled on the
>>cores.  I think it's dangerous to have anything called core-id in this series or
>>feature, because people will naturally infer things which aren't true.
>>Additionally, it will lead to things like "well, we know that core x,y,z are
>>running at A%, so we can schedule things thusly..."
>>
>>Makes sense?
>>
>
> The concerns above makes sense and you have a valid point. 
> Unfortunately the logic that you see w.r.t PMD, core_id mapping is something that was
> implemented in rte_keepalive library and I inherited it. As the 1:1
> mapping of a thread(PMD)
> to core is deceptive and makes little sense, I reworked on a different
> approach with no impact
> on datapath performance. I was testing this last few days to check for
> perf impacts and other
> possible issues.
>
> Previous design:
> --------------------
> As part of heartbeat mechanism (dispath_heartbeats()),  in keepalive_info structure
> we had arrays indexed by core-ids used by PMDs and Keepalive thread for heart-beating.
> The arrays were used to keep the logic simple and lock-free.
>
> Also Keepalive thread was updating the status periodically in to
> 'process_list' map using callback function.
>
> New design:
> -------------------
> we already have a 'process_list' map where all the PMD threads are
> added by main(vswitchd)
> thread. In this new approach, I take a copy of the 'process_list',
> let's call it 'cached_process_list'
> and use this cached map for heartbeating between Keepalive and PMD
> threads. No locks are
> needed on the 'cached_process_list' there by not impacting the datapath performance.
>
> Also whenever there is datapath reconfiguration(triggered by
> pmd-cpu-mask), the 'process_list' map
> will be updated and also the cached_process_list will be reloaded from
> the main map there by having
> the maps in sync.  This is handled as part of
> ka_register_datapath_threads().  I have been testing this
> and seems to be working fine.
>
> This way we can completely avoid all references to core_id in this
> series. Let me know if you have
> any comments on this new approach.

Sounds good to me.  Looking forward to the code :-)

> Regards,
> Bhanuprakash.
diff mbox

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e2cd931..84c7ffd 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -49,6 +49,7 @@ 
 #include "flow.h"
 #include "hmapx.h"
 #include "id-pool.h"
+#include "keepalive.h"
 #include "latch.h"
 #include "netdev.h"
 #include "netdev-vport.h"
@@ -978,6 +979,63 @@  sorted_poll_thread_list(struct dp_netdev *dp,
     *n = k;
 }
 
+static void *
+ovs_keepalive(void *f_ OVS_UNUSED)
+{
+    pthread_detach(pthread_self());
+
+    for (;;) {
+        xusleep(get_ka_interval() * 1000);
+    }
+
+    return NULL;
+}
+
+static void
+ka_thread_start(struct dp_netdev *dp)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+
+    if (ovsthread_once_start(&once)) {
+        ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
+
+        ovsthread_once_done(&once);
+    }
+}
+
+static void
+ka_register_datapath_threads(struct dp_netdev *dp)
+{
+    int ka_init = get_ka_init_status();
+    VLOG_DBG("Keepalive: Was initialization successful? [%s]",
+                ka_init ? "Success" : "Failure");
+    if (!ka_init) {
+        return;
+    }
+
+    ka_thread_start(dp);
+
+    struct dp_netdev_pmd_thread *pmd;
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        /*  Register only PMD threads. */
+        if (pmd->core_id != NON_PMD_CORE_ID) {
+            int tid = ka_get_pmd_tid(pmd->core_id);
+
+            /* Skip PMD thread with no rxqs mapping. */
+            if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
+                /* rxq mapping changes due to reconfiguration,
+                 * if there are no rxqs mapped to PMD, unregister it. */
+                ka_unregister_thread(tid, true);
+                continue;
+            }
+
+            ka_register_thread(tid, true);
+            VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA framework",
+                      tid, pmd->core_id);
+        }
+    }
+}
+
 static void
 dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
                      void *aux)
@@ -3625,6 +3683,9 @@  reconfigure_datapath(struct dp_netdev *dp)
 
     /* Reload affected pmd threads. */
     reload_affected_pmds(dp);
+
+    /* Register datapath threads to KA monitoring. */
+    ka_register_datapath_threads(dp);
 }
 
 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
@@ -3824,6 +3885,8 @@  pmd_thread_main(void *f_)
 
     poll_list = NULL;
 
+    ka_store_pmd_id(pmd->core_id);
+
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
@@ -3859,6 +3922,9 @@  reload:
                                                       : PMD_CYCLES_IDLE);
         }
 
+        /* Mark PMD thread alive. */
+        ka_mark_pmd_thread_alive(pmd->core_id);
+
         if (lc++ > 1024) {
             bool reload;
 
@@ -3892,6 +3958,10 @@  reload:
     }
 
     emc_cache_uninit(&pmd->flow_cache);
+
+    int tid = ka_get_pmd_tid(pmd->core_id);
+    ka_unregister_thread(tid, true);
+
     free(poll_list);
     pmd_free_cached_ports(pmd);
     return NULL;
diff --git a/lib/keepalive.c b/lib/keepalive.c
index ac73a42..dfafaeb 100644
--- a/lib/keepalive.c
+++ b/lib/keepalive.c
@@ -23,6 +23,7 @@ 
 #include "keepalive.h"
 #include "lib/vswitch-idl.h"
 #include "openvswitch/vlog.h"
+#include "process.h"
 #include "timeval.h"
 
 VLOG_DEFINE_THIS_MODULE(keepalive);
@@ -48,6 +49,134 @@  ka_get_pmd_tid(unsigned core_idx)
     return -EINVAL;
 }
 
+/* Return the Keepalive timer interval. */
+inline uint32_t
+get_ka_interval(void)
+{
+    return keepalive_timer_interval;
+}
+
+int
+get_ka_init_status(void)
+{
+    return ka_init_status;
+}
+
+void
+ka_store_pmd_id(unsigned core_idx)
+{
+    int tid = gettid();
+
+    if (ka_is_enabled()) {
+        ovs_assert(tid > 0);
+        ka_info->thread_id[core_idx] = tid;
+    }
+}
+
+/* Register thread to KA framework. */
+void
+ka_register_thread(int tid, bool thread_is_pmd)
+{
+    if (ka_is_enabled()) {
+        struct ka_process_info *ka_pinfo;
+        int core_num = -1;
+        char proc_name[18] = "UNDEFINED";
+
+        struct process_info pinfo;
+        int success = get_process_info(tid, &pinfo);
+        if (success) {
+            core_num = pinfo.core_id;
+            ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
+        }
+
+        ovs_assert(core_num >= 0);
+
+        uint32_t hash = hash_int(tid, 0);
+        ovs_mutex_lock(&ka_info->proclist_mutex);
+        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
+                                 hash, &ka_info->process_list) {
+            /* PMD thread is already registered. */
+            if (ka_pinfo->tid == tid) {
+                goto out;
+            }
+        }
+
+        ka_pinfo = xmalloc(sizeof *ka_pinfo);
+        ka_pinfo->tid = tid;
+        ka_pinfo->heartbeats = true;
+        ka_pinfo->core_id = core_num;
+        ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name);
+
+        hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
+
+        if (thread_is_pmd) {
+            ka_info->active_cores[core_num] = KA_STATE_ALIVE;
+            ka_info->last_alive[core_num] = time_wall_msec();
+            ka_info->pmd_cnt++;  /* Increment PMD count. */
+        } else {
+            ka_info->nonpmd_cnt++;  /* Increment non-pmd thread count. */
+        }
+out:
+        ovs_mutex_unlock(&ka_info->proclist_mutex);
+    }
+}
+
+/* Unregister thread from KA framework. */
+void
+ka_unregister_thread(int tid, bool thread_is_pmd)
+{
+    if (ka_is_enabled()) {
+        struct ka_process_info *ka_pinfo;
+
+        int core_num = -1;
+        struct process_info pinfo;
+        if (get_process_info(tid, &pinfo)) {
+            core_num = pinfo.core_id;
+        }
+
+        ovs_assert(core_num >= 0);
+
+        ovs_mutex_lock(&ka_info->proclist_mutex);
+        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
+                                 &ka_info->process_list) {
+            /* If PMD thread is registered, remove it from the list */
+            if (ka_pinfo->tid == tid) {
+                hmap_remove(&ka_info->process_list, &ka_pinfo->node);
+                free(ka_pinfo);
+
+                if (thread_is_pmd) {
+                    ka_info->active_cores[core_num] = KA_STATE_UNUSED;
+                    ka_info->pmd_cnt--;  /* Decrement PMD count. */
+                } else {
+                    ka_info->nonpmd_cnt--;  /* Decrement non-pmd thread cnt. */
+                }
+
+                break;
+            }
+        }
+
+        ovs_mutex_unlock(&ka_info->proclist_mutex);
+    }
+}
+
+/* Mark packet processing core alive. */
+void
+ka_mark_pmd_thread_alive(unsigned core_id)
+{
+    if (ka_is_enabled()) {
+        ka_info->state_flags[core_id] = KA_STATE_ALIVE;
+    }
+}
+
+/* Mark packet processing core as idle. */
+inline void
+ka_mark_pmd_thread_sleep(unsigned core_id)
+{
+    if (ka_is_enabled()) {
+        ka_info->state_flags[core_id] = KA_STATE_DOZING;
+    }
+}
+
 void
 ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
                     uint64_t last_alive)
@@ -166,18 +295,20 @@  ka_init(const struct smap *ovs_other_config)
 void
 ka_destroy(void)
 {
-    if (ka_info) {
-        struct ka_process_info *pinfo;
-
-        ovs_mutex_lock(&ka_info->proclist_mutex);
-        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
-            free(pinfo);
-        }
-        ovs_mutex_unlock(&ka_info->proclist_mutex);
+    if (!ka_info) {
+        return;
+    }
 
-        hmap_destroy(&ka_info->process_list);
-        ovs_mutex_destroy(&ka_info->proclist_mutex);
+    struct ka_process_info *pinfo;
 
-        free(ka_info);
+    ovs_mutex_lock(&ka_info->proclist_mutex);
+    HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
+        free(pinfo);
     }
+    ovs_mutex_unlock(&ka_info->proclist_mutex);
+
+    hmap_destroy(&ka_info->process_list);
+    ovs_mutex_destroy(&ka_info->proclist_mutex);
+
+    free(ka_info);
 }
diff --git a/lib/keepalive.h b/lib/keepalive.h
index d133398..607ee3b 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -38,8 +38,10 @@  enum keepalive_state {
 typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
 
 struct ka_process_info {
+    char name[16];
     int tid;
     int core_id;
+    bool heartbeats;
     enum keepalive_state core_state;
     uint64_t core_last_seen_times;
     struct hmap_node node;
@@ -52,6 +54,10 @@  struct keepalive_info {
     /* List of process/threads monitored by KA framework. */
     struct hmap process_list OVS_GUARDED;
 
+    /* count of threads registered to KA framework. */
+    uint32_t pmd_cnt;
+    uint32_t nonpmd_cnt;
+
     /* Store Datapath threads 'tid'.
      * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
     pid_t thread_id[KA_DP_MAXCORES];
@@ -84,4 +90,15 @@  void ka_update_core_state(const int, const enum keepalive_state,
 
 int ka_get_pmd_tid(unsigned core);
 bool ka_is_enabled(void);
+void ka_register_thread(int, bool);
+void ka_unregister_thread(int, bool);
+void ka_mark_pmd_thread_alive(unsigned);
+void ka_mark_pmd_thread_sleep(unsigned);
+
+void ka_store_pmd_id(unsigned core);
+uint32_t get_ka_interval(void);
+int get_ka_init_status(void);
+int ka_alloc_portstats(unsigned, int);
+void ka_destroy_portstats(void);
+
 #endif /* keepalive.h */
diff --git a/lib/util.c b/lib/util.c
index 36e3731..83f7e0a 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -26,6 +26,9 @@ 
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
+#ifdef __linux__
+#include <sys/syscall.h>
+#endif
 #include <unistd.h>
 #include "bitmap.h"
 #include "byte-order.h"
@@ -568,6 +571,16 @@  get_page_size(void)
     return cached;
 }
 
+int
+gettid(void)
+{
+#ifdef __linux__
+    return (int)syscall(SYS_gettid);
+#endif
+
+    return -EINVAL;
+}
+
 /* Returns the time at which the system booted, as the number of milliseconds
  * since the epoch, or 0 if the time of boot cannot be determined. */
 long long int
@@ -2197,6 +2210,16 @@  xsleep(unsigned int seconds)
     ovsrcu_quiesce_end();
 }
 
+void
+xusleep(unsigned int microseconds)
+{
+    ovsrcu_quiesce_start();
+#ifdef __linux__
+    usleep(microseconds);
+#endif
+    ovsrcu_quiesce_end();
+}
+
 /* Determine whether standard output is a tty or not. This is useful to decide
  * whether to use color output or not when --color option for utilities is set
  * to `auto`.
diff --git a/lib/util.h b/lib/util.h
index 764e0a0..f3cb61a 100644
--- a/lib/util.h
+++ b/lib/util.h
@@ -120,6 +120,7 @@  const char *get_subprogram_name(void);
 
 unsigned int get_page_size(void);
 long long int get_boot_time(void);
+int gettid(void);
 
 void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
 
@@ -489,6 +490,7 @@  ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
 }
 
 void xsleep(unsigned int seconds);
+void xusleep(unsigned int microseconds);
 
 bool is_stdout_a_tty(void);