@@ -117,6 +117,7 @@ ka_register_thread(int tid, bool thread_is_pmd)
ka_pinfo->core_id = core_num;
ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name);
ka_pinfo->healthcheck = PMD_HC_DISABLE;
+ ka_pinfo->failures = 0;
hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
@@ -278,6 +279,21 @@ ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
}
void
+ka_inc_pmd_failures(unsigned core_id) /* Increments the 'failures' counter of the process-list entry for 'core_id'. */
+{
+ struct ka_process_info *pinfo;
+ int tid = ka_get_pmd_tid(core_id); /* Thread id for 'core_id'; hashed below to locate the entry.
+ * NOTE(review): fetched before the mutex is taken and not checked for
+ * an error value -- confirm that is safe. */
+ ovs_mutex_lock(&ka_info->proclist_mutex); /* Held across the lookup and the update. */
+ HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
+ &ka_info->process_list) {
+ if (pinfo->core_id == core_id) { /* Lookup is keyed on the tid hash, so also match the entry's core. */
+ pinfo->failures++; /* No break follows: every matching entry with this hash is incremented. */
+ }
+ }
+ ovs_mutex_unlock(&ka_info->proclist_mutex);
+}
+
+void
ka_load_process_list(struct hmap **process_list)
{
if (ka_is_enabled()) {
@@ -64,6 +64,7 @@ struct ka_process_info {
enum pmdhealth_check healthcheck;
enum keepalive_state core_state;
uint64_t core_last_seen_times;
+ int failures;
struct hmap_node node;
};
@@ -127,6 +128,7 @@ void ka_disable_pmd_health_check(unsigned);
bool ka_is_pmdhealth_check_enabled(unsigned);
enum pmdhealth_check ka_get_pmd_health_check_state(unsigned);
void ka_set_pmd_health_check_state(unsigned, enum pmdhealth_check);
+void ka_inc_pmd_failures(unsigned); /* Increments the failure counter of the PMD on the given core id. */
void ka_store_pmd_id(unsigned core);
uint32_t get_ka_interval(void);
@@ -622,6 +622,52 @@ dpdk_failcore_cb(void *ptr_data OVS_UNUSED, const int core_id)
}
}
+static void
+dpdk_ka_handle_failure(enum keepalive_state fail_state, const int core_id,
+ const enum rte_keepalive_state core_state,
+ uint64_t last_alive)
+{ /* Handles a PMD core that the keepalive monitor reported as failing.
+ *
+ * 'fail_state' selects the handling: KA_STATE_DEAD records the DEAD state
+ * and, if the PMD process is still active, enables its health check;
+ * KA_STATE_GONE acts on the current health check state of the core. Any
+ * other value is ignored. 'core_id' identifies the PMD core, 'core_state'
+ * is the raw rte_keepalive state that is stored once health checking has
+ * completed, and 'last_alive' is the timestamp stored with each state
+ * update. */
+ if (fail_state == KA_STATE_DEAD) {
+ /* If process is in DEFUNCT/UNINTERRUPTIBLE/TRACED state it is inactive
+ * and no additional health checks are needed. */
+ uint32_t tid = ka_get_pmd_tid(core_id); /* NOTE(review): ka_inc_pmd_failures() stores this result in an 'int';
+ * a negative return would wrap to a large unsigned value here --
+ * confirm the intended type. */
+ if (process_is_active(tid)) {
+ /* Enable PMD health check only when PMD is in 'RUNNING' state and
+ * still doesn't respond to heartbeats. Health checks are needed to
+ * analyze other stats as we are in penultimate state of declaring
+ * PMD as failed. */
+ ka_enable_pmd_health_check(core_id);
+ }
+ ka_set_pmd_state_ts(core_id, KA_STATE_DEAD, last_alive); /* Recorded whether or not the health check was enabled. */
+ }
+
+ if (fail_state == KA_STATE_GONE) {
+ int pmd_hc_state = ka_get_pmd_health_check_state(core_id);
+
+ switch (pmd_hc_state) {
+ case PMD_HC_ENABLE: /* Nothing is done here for this state. */
+ break;
+ case PMD_HC_DISABLE: /* Only logs. NOTE(review): the recorded state is not advanced in this case -- confirm intended. */
+ VLOG_DBG_RL(&rl, "PMD thread [%d] died, health check disabled",
+ core_id);
+ break;
+ case PMD_HC_PROGRESS: /* Health check still running: record the intermediate CHECK state. */
+ ka_set_pmd_state_ts(core_id, KA_STATE_CHECK, last_alive);
+ break;
+
+ case PMD_HC_COMPLETE: /* Count the failure, store the reported state, then switch the health check off. */
+ ka_inc_pmd_failures(core_id);
+ ka_set_pmd_state_ts(core_id, core_state, last_alive);
+ ka_disable_pmd_health_check(core_id);
+ break;
+
+ default: /* Any other value is logged and treated as unreachable. */
+ VLOG_DBG_RL(&rl, "Unknown health check state %d", pmd_hc_state);
+ OVS_NOT_REACHED();
+ }
+ }
+}
+
/*
* This function shall be invoked periodically to write the core status and
* last seen timestamp of the cores in to keepalive info structure.
@@ -634,11 +680,23 @@ dpdk_ka_update_core_state(void *ptr_data OVS_UNUSED, const int core_id,
case RTE_KA_STATE_ALIVE:
case RTE_KA_STATE_MISSING:
ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
+
+ /* Health checks should be disabled when PMD is alive. */
+ if (OVS_UNLIKELY(ka_get_pmd_health_check_state(core_id) !=
+ PMD_HC_DISABLE)) {
+ ka_disable_pmd_health_check(core_id);
+ }
break;
- case RTE_KA_STATE_DOZING:
- case RTE_KA_STATE_SLEEP:
case RTE_KA_STATE_DEAD:
+ dpdk_ka_handle_failure(KA_STATE_DEAD, core_id, core_state,
+ last_alive);
+ break;
case RTE_KA_STATE_GONE:
+ dpdk_ka_handle_failure(KA_STATE_GONE, core_id, core_state,
+ last_alive);
+ break;
+ case RTE_KA_STATE_DOZING:
+ case RTE_KA_STATE_SLEEP:
ka_set_pmd_state_ts(core_id, core_state, last_alive);
break;
case RTE_KA_STATE_UNUSED:
The keepalive thread sends heartbeats to the PMD thread, and when the PMD fails to respond to successive heartbeats it is potentially stalled. The PMD state transition is as below: ALIVE -> MISSING -> DEAD -> GONE. This commit enables PMD health checks when the PMD doesn't respond to heartbeats. This is needed to handle false negatives. With this commit the new state transition is as below: ALIVE -> MISSING -> DEAD -> CHECK -> GONE. A PMD health checking state is introduced and kicks in immediately when the PMD gets into the DEAD state. As part of this, the following are considered: - Link status of the ports polled by the PMD thread. - Statistics of the ports polled by the PMD thread. - PMD polling and processing cycles. Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> --- lib/keepalive.c | 16 ++++++++++++++ lib/keepalive.h | 2 ++ lib/netdev-dpdk.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 78 insertions(+), 2 deletions(-)