Message ID | 1547762360-7075-3-git-send-email-eranbe@mellanox.com |
---|---|
State | Accepted |
Delegated to: | David Miller |
Headers | show |
Series | Devlink health reporting and recovery system | expand |
Thu, Jan 17, 2019 at 10:59:11PM CET, eranbe@mellanox.com wrote: >Devlink health reporter is an instance for reporting, diagnosing and >recovering from run time errors discovered by the reporters. >Define it's data structure and supported operations. >In addition, expose devlink API to create and destroy a reporter. >Each devlink instance will hold it's own reporters list. > >As part of the allocation, driver shall provide a set of callbacks which >will be used the devlink in order to handle health reports and user >commands related to this reporter. In addition, driver is entitled to >provide some priv pointer, which can be fetched from the reporter by >devlink_health_reporter_priv function. > >For each reporter, devlink will hold a metadata of statistics, >buffers and status. > >Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> >Reviewed-by: Moshe Shemesh <moshe@mellanox.com> >--- > include/net/devlink.h | 59 ++++++++++++++++++++ > net/core/devlink.c | 127 ++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 186 insertions(+) > >diff --git a/include/net/devlink.h b/include/net/devlink.h >index 77c77319290a..7fe30d67678a 100644 >--- a/include/net/devlink.h >+++ b/include/net/devlink.h >@@ -30,6 +30,7 @@ struct devlink { > struct list_head param_list; > struct list_head region_list; > u32 snapshot_id; >+ struct list_head reporter_list; > struct devlink_dpipe_headers *dpipe_headers; > const struct devlink_ops *ops; > struct device *dev; >@@ -424,6 +425,34 @@ struct devlink_region; > typedef void devlink_snapshot_data_dest_t(const void *data); > > struct devlink_health_buffer; >+struct devlink_health_reporter; >+ >+/** >+ * struct devlink_health_reporter_ops - Reporter operations >+ * @name: reporter name >+ * dump_size: dump buffer size allocated by the devlink >+ * diagnose_size: diagnose buffer size allocated by the devlink >+ * recover: callback to recover from reported error >+ * if priv_ctx is NULL, run a full recover >+ * dump: callback to dump an object >+ * if priv_ctx is NULL, run a full dump >+ * diagnose: callback to diagnose the current status >+ */ >+ >+struct devlink_health_reporter_ops { >+ char *name; >+ unsigned int dump_size; >+ unsigned int diagnose_size; >+ int (*recover)(struct devlink_health_reporter *reporter, >+ void *priv_ctx); >+ int (*dump)(struct devlink_health_reporter *reporter, >+ struct devlink_health_buffer **buffers_array, >+ unsigned int buffer_size, unsigned int num_buffers, >+ void *priv_ctx); >+ int (*diagnose)(struct devlink_health_reporter *reporter, >+ struct devlink_health_buffer **buffers_array, >+ unsigned int buffer_size, unsigned int num_buffers); >+}; > > struct devlink_ops { > int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); >@@ -602,6 +631,16 @@ int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, > char *name); > int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, > void *data, int len); >+struct devlink_health_reporter * >+devlink_health_reporter_create(struct devlink *devlink, >+ const struct devlink_health_reporter_ops *ops, >+ u64 graceful_period, bool auto_recover, >+ void *priv); >+void >+devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); >+ >+void * >+devlink_health_reporter_priv(struct devlink_health_reporter *reporter); > #else > > static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, >@@ -920,6 +959,26 @@ devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, > { > return 0; > } >+ >+static inline struct devlink_health_reporter * >+devlink_health_reporter_create(struct devlink *devlink, >+ const struct devlink_health_reporter_ops *ops, >+ u64 graceful_period, bool auto_recover, >+ void *priv) >+{ >+ return NULL; >+} >+ >+static inline void >+devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) >+{ >+} >+ >+static inline void * >+devlink_health_reporter_priv(struct devlink_health_reporter *reporter) >+{ >+ return NULL; >+} > #endif > > #endif /* _NET_DEVLINK_H_ */ >diff --git a/net/core/devlink.c b/net/core/devlink.c >index 8984501edade..fec169a28dba 100644 >--- a/net/core/devlink.c >+++ b/net/core/devlink.c >@@ -4098,6 +4098,132 @@ devlink_health_buffer_snd(struct genl_info *info, > return err; > } > >+struct devlink_health_reporter { >+ struct list_head list; >+ struct devlink_health_buffer **dump_buffers_array; >+ struct mutex dump_lock; /* lock parallel read/write from dump buffers */ >+ struct devlink_health_buffer **diagnose_buffers_array; >+ struct mutex diagnose_lock; /* lock parallel read/write from diagnose buffers */ >+ void *priv; >+ const struct devlink_health_reporter_ops *ops; >+ struct devlink *devlink; >+ u64 graceful_period; >+ bool auto_recover; >+ u8 health_state; >+}; >+ >+void * >+devlink_health_reporter_priv(struct devlink_health_reporter *reporter) >+{ >+ return reporter->priv; >+} >+EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); >+ >+static struct devlink_health_reporter * >+devlink_health_reporter_find_by_name(struct devlink *devlink, >+ const char *reporter_name) >+{ >+ struct devlink_health_reporter *reporter; >+ >+ list_for_each_entry(reporter, &devlink->reporter_list, list) >+ if (!strcmp(reporter->ops->name, reporter_name)) >+ return reporter; >+ return NULL; >+} >+ >+/** >+ * devlink_health_reporter_create - create devlink health reporter >+ * >+ * @devlink: devlink >+ * @ops: ops >+ * @graceful_period: to avoid recovery loops, in msecs >+ * @auto_recover: auto recover when error occurs >+ * @priv: priv >+ */ >+struct devlink_health_reporter * >+devlink_health_reporter_create(struct devlink *devlink, >+ const struct devlink_health_reporter_ops *ops, >+ u64 graceful_period, bool auto_recover, >+ void *priv) >+{ >+ struct devlink_health_reporter *reporter; >+ >+ mutex_lock(&devlink->lock); >+ if (devlink_health_reporter_find_by_name(devlink, ops->name)) { >+ reporter = ERR_PTR(-EEXIST); >+ goto unlock; >+ } >+ >+ if (WARN_ON(ops->dump && !ops->dump_size) || >+ WARN_ON(ops->diagnose && !ops->diagnose_size) || >+ WARN_ON(auto_recover && !ops->recover) || >+ WARN_ON(graceful_period && !ops->recover)) { >+ reporter = ERR_PTR(-EINVAL); >+ goto unlock; >+ } >+ >+ reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); >+ if (!reporter) { >+ reporter = ERR_PTR(-ENOMEM); >+ goto unlock; >+ } >+ >+ if (ops->dump) { >+ reporter->dump_buffers_array = >+ devlink_health_buffers_create(ops->dump_size); >+ if (!reporter->dump_buffers_array) { >+ kfree(reporter); >+ reporter = ERR_PTR(-ENOMEM); >+ goto unlock; >+ } >+ } >+ >+ if (ops->diagnose) { >+ reporter->diagnose_buffers_array = >+ devlink_health_buffers_create(ops->diagnose_size); >+ if (!reporter->diagnose_buffers_array) { >+ devlink_health_buffers_destroy(reporter->dump_buffers_array, >+ DEVLINK_HEALTH_SIZE_TO_BUFFERS(ops->dump_size)); This is just ugly. :/ As I wrote in the other email, should be converted to simple "msg_ctx = msg_ctx_create(), msg_ctx_destroy(msg_ctx)", no sizes, no buffers visible to the driver. >+ kfree(reporter); >+ reporter = ERR_PTR(-ENOMEM); >+ goto unlock; >+ } >+ } >+ >+ list_add_tail(&reporter->list, &devlink->reporter_list); >+ mutex_init(&reporter->dump_lock); >+ mutex_init(&reporter->diagnose_lock); >+ >+ reporter->priv = priv; >+ reporter->ops = ops; >+ reporter->devlink = devlink; >+ reporter->graceful_period = graceful_period; >+ reporter->auto_recover = auto_recover; >+unlock: >+ mutex_unlock(&devlink->lock); >+ return reporter; >+} >+EXPORT_SYMBOL_GPL(devlink_health_reporter_create); >+ >+/** >+ * devlink_health_reporter_destroy - destroy devlink health reporter >+ * >+ * @reporter: devlink health reporter to destroy >+ */ >+void >+devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) >+{ >+ mutex_lock(&reporter->devlink->lock); >+ list_del(&reporter->list); >+ devlink_health_buffers_destroy(reporter->dump_buffers_array, >+ DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); >+ devlink_health_buffers_destroy(reporter->diagnose_buffers_array, >+ DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size)); >+ kfree(reporter); >+ mutex_unlock(&reporter->devlink->lock); >+} >+EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); >+ > static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { > [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, > [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, >@@ -4383,6 +4509,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) > INIT_LIST_HEAD(&devlink->resource_list); > INIT_LIST_HEAD(&devlink->param_list); > INIT_LIST_HEAD(&devlink->region_list); >+ INIT_LIST_HEAD(&devlink->reporter_list); > mutex_init(&devlink->lock); > return devlink; > } >-- >2.17.1 >
diff --git a/include/net/devlink.h b/include/net/devlink.h index 77c77319290a..7fe30d67678a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -30,6 +30,7 @@ struct devlink { struct list_head param_list; struct list_head region_list; u32 snapshot_id; + struct list_head reporter_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -424,6 +425,34 @@ struct devlink_region; typedef void devlink_snapshot_data_dest_t(const void *data); struct devlink_health_buffer; +struct devlink_health_reporter; + +/** + * struct devlink_health_reporter_ops - Reporter operations + * @name: reporter name + * dump_size: dump buffer size allocated by the devlink + * diagnose_size: diagnose buffer size allocated by the devlink + * recover: callback to recover from reported error + * if priv_ctx is NULL, run a full recover + * dump: callback to dump an object + * if priv_ctx is NULL, run a full dump + * diagnose: callback to diagnose the current status + */ + +struct devlink_health_reporter_ops { + char *name; + unsigned int dump_size; + unsigned int diagnose_size; + int (*recover)(struct devlink_health_reporter *reporter, + void *priv_ctx); + int (*dump)(struct devlink_health_reporter *reporter, + struct devlink_health_buffer **buffers_array, + unsigned int buffer_size, unsigned int num_buffers, + void *priv_ctx); + int (*diagnose)(struct devlink_health_reporter *reporter, + struct devlink_health_buffer **buffers_array, + unsigned int buffer_size, unsigned int num_buffers); +}; struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); @@ -602,6 +631,16 @@ int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, char *name); int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, void *data, int len); +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, bool auto_recover, + void *priv); +void +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); + +void * +devlink_health_reporter_priv(struct devlink_health_reporter *reporter); #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -920,6 +959,26 @@ devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, { return 0; } + +static inline struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, bool auto_recover, + void *priv) +{ + return NULL; +} + +static inline void +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ +} + +static inline void * +devlink_health_reporter_priv(struct devlink_health_reporter *reporter) +{ + return NULL; +} #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 8984501edade..fec169a28dba 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4098,6 +4098,132 @@ devlink_health_buffer_snd(struct genl_info *info, return err; } +struct devlink_health_reporter { + struct list_head list; + struct devlink_health_buffer **dump_buffers_array; + struct mutex dump_lock; /* lock parallel read/write from dump buffers */ + struct devlink_health_buffer **diagnose_buffers_array; + struct mutex diagnose_lock; /* lock parallel read/write from diagnose buffers */ + void *priv; + const struct devlink_health_reporter_ops *ops; + struct devlink *devlink; + u64 graceful_period; + bool auto_recover; + u8 health_state; +}; + +void * +devlink_health_reporter_priv(struct devlink_health_reporter *reporter) +{ + return reporter->priv; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); + +static struct devlink_health_reporter * +devlink_health_reporter_find_by_name(struct devlink *devlink, + const char *reporter_name) +{ + struct devlink_health_reporter *reporter; + + list_for_each_entry(reporter, &devlink->reporter_list, list) + if (!strcmp(reporter->ops->name, reporter_name)) + return reporter; + return NULL; +} + +/** + * devlink_health_reporter_create - create devlink health reporter + * + * @devlink: devlink + * @ops: ops + * @graceful_period: to avoid recovery loops, in msecs + * @auto_recover: auto recover when error occurs + * @priv: priv + */ +struct devlink_health_reporter * +devlink_health_reporter_create(struct devlink *devlink, + const struct devlink_health_reporter_ops *ops, + u64 graceful_period, bool auto_recover, + void *priv) +{ + struct devlink_health_reporter *reporter; + + mutex_lock(&devlink->lock); + if (devlink_health_reporter_find_by_name(devlink, ops->name)) { + reporter = ERR_PTR(-EEXIST); + goto unlock; + } + + if (WARN_ON(ops->dump && !ops->dump_size) || + WARN_ON(ops->diagnose && !ops->diagnose_size) || + WARN_ON(auto_recover && !ops->recover) || + WARN_ON(graceful_period && !ops->recover)) { + reporter = ERR_PTR(-EINVAL); + goto unlock; + } + + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); + if (!reporter) { + reporter = ERR_PTR(-ENOMEM); + goto unlock; + } + + if (ops->dump) { + reporter->dump_buffers_array = + devlink_health_buffers_create(ops->dump_size); + if (!reporter->dump_buffers_array) { + kfree(reporter); + reporter = ERR_PTR(-ENOMEM); + goto unlock; + } + } + + if (ops->diagnose) { + reporter->diagnose_buffers_array = + devlink_health_buffers_create(ops->diagnose_size); + if (!reporter->diagnose_buffers_array) { + devlink_health_buffers_destroy(reporter->dump_buffers_array, + DEVLINK_HEALTH_SIZE_TO_BUFFERS(ops->dump_size)); + kfree(reporter); + reporter = ERR_PTR(-ENOMEM); + goto unlock; + } + } + + list_add_tail(&reporter->list, &devlink->reporter_list); + mutex_init(&reporter->dump_lock); + mutex_init(&reporter->diagnose_lock); + + reporter->priv = priv; + reporter->ops = ops; + reporter->devlink = devlink; + reporter->graceful_period = graceful_period; + reporter->auto_recover = auto_recover; +unlock: + mutex_unlock(&devlink->lock); + return reporter; +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_create); + +/** + * devlink_health_reporter_destroy - destroy devlink health reporter + * + * @reporter: devlink health reporter to destroy + */ +void +devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) +{ + mutex_lock(&reporter->devlink->lock); + list_del(&reporter->list); + devlink_health_buffers_destroy(reporter->dump_buffers_array, + DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); + devlink_health_buffers_destroy(reporter->diagnose_buffers_array, + DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size)); + kfree(reporter); + mutex_unlock(&reporter->devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4383,6 +4509,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->param_list); INIT_LIST_HEAD(&devlink->region_list); + INIT_LIST_HEAD(&devlink->reporter_list); mutex_init(&devlink->lock); return devlink; }