Message ID | 1585142784-10517-3-git-send-email-eranbe@mellanox.com |
---|---|
State | Changes Requested |
Delegated to: | David Miller |
Headers | show |
Series | Devlink health auto attributes refactor | expand |
On Wed, 25 Mar 2020 15:26:24 +0200 Eran Ben Elisha wrote: > On low memory system, run time dumps can consume too much memory. Add > administrator ability to disable auto dumps per reporter as part of the > error flow handle routine. > > This attribute is not relevant while executing > DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. > > By default, auto dump is activated for any reporter that has a dump method, > as part of the reporter registration to devlink. > > Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> > Reviewed-by: Jiri Pirko <jiri@mellanox.com> > --- > include/uapi/linux/devlink.h | 2 ++ > net/core/devlink.c | 26 ++++++++++++++++++++++---- > 2 files changed, 24 insertions(+), 4 deletions(-) > > diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h > index dfdffc42e87d..e7891d1d2ebd 100644 > --- a/include/uapi/linux/devlink.h > +++ b/include/uapi/linux/devlink.h > @@ -429,6 +429,8 @@ enum devlink_attr { > DEVLINK_ATTR_NETNS_FD, /* u32 */ > DEVLINK_ATTR_NETNS_PID, /* u32 */ > DEVLINK_ATTR_NETNS_ID, /* u32 */ > + > + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ > /* add new attributes above here, update the policy in devlink.c */ > > __DEVLINK_ATTR_MAX, > diff --git a/net/core/devlink.c b/net/core/devlink.c > index ad69379747ef..e14bf3052289 100644 > --- a/net/core/devlink.c > +++ b/net/core/devlink.c > @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { > struct mutex dump_lock; /* lock parallel read/write from dump buffers */ > u64 graceful_period; > bool auto_recover; > + bool auto_dump; > u8 health_state; > u64 dump_ts; > u64 dump_real_ts; > @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, > reporter->devlink = devlink; > reporter->graceful_period = graceful_period; > reporter->auto_recover = !!ops->recover; > + reporter->auto_dump = !!ops->dump; > mutex_init(&reporter->dump_lock); > refcount_set(&reporter->refcount, 1); > list_add_tail(&reporter->list, &devlink->reporter_list); > @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, > nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, > reporter->dump_real_ts, DEVLINK_ATTR_PAD)) > goto reporter_nest_cancel; > + if (reporter->ops->dump && > + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, > + reporter->auto_dump)) > + goto reporter_nest_cancel; Since you're making it a u8 - does it make sense to indicate to user space whether the dump is disabled or not supported? Right now no attribute means either old kernel or dump not possible.. > nla_nest_end(msg, reporter_attr); > genlmsg_end(msg, hdr); > @@ -5129,10 +5135,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, > > reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; > > - mutex_lock(&reporter->dump_lock); > - /* store current dump of current error, for later analysis */ > - devlink_health_do_dump(reporter, priv_ctx, NULL); > - mutex_unlock(&reporter->dump_lock); > + if (reporter->auto_dump) { > + mutex_lock(&reporter->dump_lock); > + /* store current dump of current error, for later analysis */ > + devlink_health_do_dump(reporter, priv_ctx, NULL); > + mutex_unlock(&reporter->dump_lock); > + } > > if (reporter->auto_recover) > return devlink_health_reporter_recover(reporter, > @@ -5306,6 +5314,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, > err = -EOPNOTSUPP; > goto out; > } > + if (!reporter->ops->dump && > + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { ... and then this behavior may have to change, I think? > + err = -EOPNOTSUPP; > + goto out; > + } > > if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) > reporter->graceful_period = > @@ -5315,6 +5328,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, > reporter->auto_recover = > nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); > > + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) > + reporter->auto_dump = > + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); > + > devlink_health_reporter_put(reporter); > return 0; > out: > @@ -6053,6 +6070,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { > [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, > [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, > [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, > + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, I'd suggest we keep the attrs in order of definition, because we should set .strict_start_type, and then it matters which are before and which are after. Also please set max value of 1. > [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, > [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, > [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING },
Wed, Mar 25, 2020 at 07:45:29PM CET, kuba@kernel.org wrote: >On Wed, 25 Mar 2020 15:26:24 +0200 Eran Ben Elisha wrote: >> On low memory system, run time dumps can consume too much memory. Add >> administrator ability to disable auto dumps per reporter as part of the >> error flow handle routine. >> >> This attribute is not relevant while executing >> DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. >> >> By default, auto dump is activated for any reporter that has a dump method, >> as part of the reporter registration to devlink. >> >> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> >> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >> --- >> include/uapi/linux/devlink.h | 2 ++ >> net/core/devlink.c | 26 ++++++++++++++++++++++---- >> 2 files changed, 24 insertions(+), 4 deletions(-) >> >> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >> index dfdffc42e87d..e7891d1d2ebd 100644 >> --- a/include/uapi/linux/devlink.h >> +++ b/include/uapi/linux/devlink.h >> @@ -429,6 +429,8 @@ enum devlink_attr { >> DEVLINK_ATTR_NETNS_FD, /* u32 */ >> DEVLINK_ATTR_NETNS_PID, /* u32 */ >> DEVLINK_ATTR_NETNS_ID, /* u32 */ >> + >> + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ >> /* add new attributes above here, update the policy in devlink.c */ >> >> __DEVLINK_ATTR_MAX, >> diff --git a/net/core/devlink.c b/net/core/devlink.c >> index ad69379747ef..e14bf3052289 100644 >> --- a/net/core/devlink.c >> +++ b/net/core/devlink.c >> @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { >> struct mutex dump_lock; /* lock parallel read/write from dump buffers */ >> u64 graceful_period; >> bool auto_recover; >> + bool auto_dump; >> u8 health_state; >> u64 dump_ts; >> u64 dump_real_ts; >> @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, >> reporter->devlink = devlink; >> reporter->graceful_period = graceful_period; >> reporter->auto_recover = !!ops->recover; >> + reporter->auto_dump = !!ops->dump; >> mutex_init(&reporter->dump_lock); >> refcount_set(&reporter->refcount, 1); >> list_add_tail(&reporter->list, &devlink->reporter_list); >> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, >> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, >> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) >> goto reporter_nest_cancel; >> + if (reporter->ops->dump && >> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, >> + reporter->auto_dump)) >> + goto reporter_nest_cancel; > >Since you're making it a u8 - does it make sense to indicate to user Please don't be mistaken. u8 carries a bool here. >space whether the dump is disabled or not supported? If you want to expose "not supported", I suggest to do it in another attr. Because this attr is here to do the config from userspace. Would be off if the same enum would carry "not supported". But anyway, since you opened this can, the supported/capabilities should be probably passed by a separate bitfield for all features. > >Right now no attribute means either old kernel or dump not possible.. > >> nla_nest_end(msg, reporter_attr); >> genlmsg_end(msg, hdr); >> @@ -5129,10 +5135,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, >> >> reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; >> >> - mutex_lock(&reporter->dump_lock); >> - /* store current dump of current error, for later analysis */ >> - devlink_health_do_dump(reporter, priv_ctx, NULL); >> - mutex_unlock(&reporter->dump_lock); >> + if (reporter->auto_dump) { >> + mutex_lock(&reporter->dump_lock); >> + /* store current dump of current error, for later analysis */ >> + devlink_health_do_dump(reporter, priv_ctx, NULL); >> + mutex_unlock(&reporter->dump_lock); >> + } >> >> if (reporter->auto_recover) >> return devlink_health_reporter_recover(reporter, >> @@ -5306,6 +5314,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, >> err = -EOPNOTSUPP; >> goto out; >> } >> + if (!reporter->ops->dump && >> + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { > >... and then this behavior may have to change, I think? > >> + err = -EOPNOTSUPP; >> + goto out; >> + } >> >> if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) >> reporter->graceful_period = >> @@ -5315,6 +5328,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, >> reporter->auto_recover = >> nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); >> >> + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) >> + reporter->auto_dump = >> + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); >> + >> devlink_health_reporter_put(reporter); >> return 0; >> out: >> @@ -6053,6 +6070,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { >> [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, >> [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, >> [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, >> + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, > >I'd suggest we keep the attrs in order of definition, because we should >set .strict_start_type, and then it matters which are before and which >are after. > >Also please set max value of 1. > >> [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, >> [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, >> [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, >
On 3/25/2020 9:08 PM, Jiri Pirko wrote: > Wed, Mar 25, 2020 at 07:45:29PM CET, kuba@kernel.org wrote: >> On Wed, 25 Mar 2020 15:26:24 +0200 Eran Ben Elisha wrote: >>> On low memory system, run time dumps can consume too much memory. Add >>> administrator ability to disable auto dumps per reporter as part of the >>> error flow handle routine. >>> >>> This attribute is not relevant while executing >>> DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. >>> >>> By default, auto dump is activated for any reporter that has a dump method, >>> as part of the reporter registration to devlink. >>> >>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> >>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >>> --- >>> include/uapi/linux/devlink.h | 2 ++ >>> net/core/devlink.c | 26 ++++++++++++++++++++++---- >>> 2 files changed, 24 insertions(+), 4 deletions(-) >>> >>> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >>> index dfdffc42e87d..e7891d1d2ebd 100644 >>> --- a/include/uapi/linux/devlink.h >>> +++ b/include/uapi/linux/devlink.h >>> @@ -429,6 +429,8 @@ enum devlink_attr { >>> DEVLINK_ATTR_NETNS_FD, /* u32 */ >>> DEVLINK_ATTR_NETNS_PID, /* u32 */ >>> DEVLINK_ATTR_NETNS_ID, /* u32 */ >>> + >>> + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ >>> /* add new attributes above here, update the policy in devlink.c */ >>> >>> __DEVLINK_ATTR_MAX, >>> diff --git a/net/core/devlink.c b/net/core/devlink.c >>> index ad69379747ef..e14bf3052289 100644 >>> --- a/net/core/devlink.c >>> +++ b/net/core/devlink.c >>> @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { >>> struct mutex dump_lock; /* lock parallel read/write from dump buffers */ >>> u64 graceful_period; >>> bool auto_recover; >>> + bool auto_dump; >>> u8 health_state; >>> u64 dump_ts; >>> u64 dump_real_ts; >>> @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, >>> reporter->devlink = devlink; >>> reporter->graceful_period = graceful_period; >>> reporter->auto_recover = !!ops->recover; >>> + reporter->auto_dump = !!ops->dump; >>> mutex_init(&reporter->dump_lock); >>> refcount_set(&reporter->refcount, 1); >>> list_add_tail(&reporter->list, &devlink->reporter_list); >>> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, >>> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, >>> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) >>> goto reporter_nest_cancel; >>> + if (reporter->ops->dump && >>> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, >>> + reporter->auto_dump)) >>> + goto reporter_nest_cancel; >> >> Since you're making it a u8 - does it make sense to indicate to user > > Please don't be mistaken. u8 carries a bool here. > > >> space whether the dump is disabled or not supported? > > If you want to expose "not supported", I suggest to do it in another > attr. Because this attr is here to do the config from userspace. Would > be off if the same enum would carry "not supported". > > But anyway, since you opened this can, the supported/capabilities > should be probably passed by a separate bitfield for all features. > Actually we supports trinary state per x attribute. (x can be auto-dump or auto-recover which is supported since day 1) devlink_nl_health_reporter_fill can set DEVLINK_ATTR_HEALTH_REPORTER_AUTO_X to {0 , 1 , no set} And user space devlink propagates it accordingly. If auto_x is supported, user will see "auto_x true" If auto_x is not supported, user will see "auto_x false" If x is not supported at all, user will not the auto_x at all for this reporter. > >> >> Right now no attribute means either old kernel or dump not possible. when you run on old kernel, you will not see auto-dump attribute at all. In any case you won't be able to distinguish in user space between {auto-dump, no-auto-dump, no dump support}. >> >>> nla_nest_end(msg, reporter_attr); >>> genlmsg_end(msg, hdr); >>> @@ -5129,10 +5135,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, >>> >>> reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; >>> >>> - mutex_lock(&reporter->dump_lock); >>> - /* store current dump of current error, for later analysis */ >>> - devlink_health_do_dump(reporter, priv_ctx, NULL); >>> - mutex_unlock(&reporter->dump_lock); >>> + if (reporter->auto_dump) { >>> + mutex_lock(&reporter->dump_lock); >>> + /* store current dump of current error, for later analysis */ >>> + devlink_health_do_dump(reporter, priv_ctx, NULL); >>> + mutex_unlock(&reporter->dump_lock); >>> + } >>> >>> if (reporter->auto_recover) >>> return devlink_health_reporter_recover(reporter, >>> @@ -5306,6 +5314,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, >>> err = -EOPNOTSUPP; >>> goto out; >>> } >>> + if (!reporter->ops->dump && >>> + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { >> >> ... and then this behavior may have to change, I think? >> >>> + err = -EOPNOTSUPP; >>> + goto out; >>> + } >>> >>> if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) >>> reporter->graceful_period = >>> @@ -5315,6 +5328,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, >>> reporter->auto_recover = >>> nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); >>> >>> + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) >>> + reporter->auto_dump = >>> + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); >>> + >>> devlink_health_reporter_put(reporter); >>> return 0; >>> out: >>> @@ -6053,6 +6070,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { >>> [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, >>> [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, >>> [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, >>> + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, >> >> I'd suggest we keep the attrs in order of definition, because we should >> set .strict_start_type, and then it matters which are before and which >> are after. >> >> Also please set max value of 1. >> >>> [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, >>> [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, >>> [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING }, >>
On Wed, 25 Mar 2020 21:38:35 +0200 Eran Ben Elisha wrote: > On 3/25/2020 9:08 PM, Jiri Pirko wrote: > > Wed, Mar 25, 2020 at 07:45:29PM CET, kuba@kernel.org wrote: > >> On Wed, 25 Mar 2020 15:26:24 +0200 Eran Ben Elisha wrote: > >>> On low memory system, run time dumps can consume too much memory. Add > >>> administrator ability to disable auto dumps per reporter as part of the > >>> error flow handle routine. > >>> > >>> This attribute is not relevant while executing > >>> DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. > >>> > >>> By default, auto dump is activated for any reporter that has a dump method, > >>> as part of the reporter registration to devlink. > >>> > >>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> > >>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> > >>> --- > >>> include/uapi/linux/devlink.h | 2 ++ > >>> net/core/devlink.c | 26 ++++++++++++++++++++++---- > >>> 2 files changed, 24 insertions(+), 4 deletions(-) > >>> > >>> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h > >>> index dfdffc42e87d..e7891d1d2ebd 100644 > >>> --- a/include/uapi/linux/devlink.h > >>> +++ b/include/uapi/linux/devlink.h > >>> @@ -429,6 +429,8 @@ enum devlink_attr { > >>> DEVLINK_ATTR_NETNS_FD, /* u32 */ > >>> DEVLINK_ATTR_NETNS_PID, /* u32 */ > >>> DEVLINK_ATTR_NETNS_ID, /* u32 */ > >>> + > >>> + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ > >>> /* add new attributes above here, update the policy in devlink.c */ > >>> > >>> __DEVLINK_ATTR_MAX, > >>> diff --git a/net/core/devlink.c b/net/core/devlink.c > >>> index ad69379747ef..e14bf3052289 100644 > >>> --- a/net/core/devlink.c > >>> +++ b/net/core/devlink.c > >>> @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { > >>> struct mutex dump_lock; /* lock parallel read/write from dump buffers */ > >>> u64 graceful_period; > >>> bool auto_recover; > >>> + bool auto_dump; > >>> u8 health_state; > >>> u64 dump_ts; > >>> u64 dump_real_ts; > >>> @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, > >>> reporter->devlink = devlink; > >>> reporter->graceful_period = graceful_period; > >>> reporter->auto_recover = !!ops->recover; > >>> + reporter->auto_dump = !!ops->dump; > >>> mutex_init(&reporter->dump_lock); > >>> refcount_set(&reporter->refcount, 1); > >>> list_add_tail(&reporter->list, &devlink->reporter_list); > >>> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, > >>> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, > >>> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) > >>> goto reporter_nest_cancel; > >>> + if (reporter->ops->dump && > >>> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, > >>> + reporter->auto_dump)) > >>> + goto reporter_nest_cancel; > >> > >> Since you're making it a u8 - does it make sense to indicate to user > > > > Please don't be mistaken. u8 carries a bool here. Are you okay with limiting the value in the policy? I guess the auto-recover doesn't have it so we'd create a little inconsistency. > >> space whether the dump is disabled or not supported? > > > > If you want to expose "not supported", I suggest to do it in another > > attr. Because this attr is here to do the config from userspace. Would > > be off if the same enum would carry "not supported". > > > > But anyway, since you opened this can, the supported/capabilities > > should be probably passed by a separate bitfield for all features. > > > > Actually we supports trinary state per x attribute. > (x can be auto-dump or auto-recover which is supported since day 1) > > devlink_nl_health_reporter_fill can set > DEVLINK_ATTR_HEALTH_REPORTER_AUTO_X to {0 , 1 , no set} > And user space devlink propagates it accordingly. > > If auto_x is supported, user will see "auto_x true" > If auto_x is not supported, user will see "auto_x false" > If x is not supported at all, user will not the auto_x at all for this > reporter. Yeah, makes perfect the only glitch is that for auto-dump we have the old kernel case. auto-recover was there from day 1. > >> Right now no attribute means either old kernel or dump not possible. > when you run on old kernel, you will not see auto-dump attribute at all. > In any case you won't be able to distinguish in user space between > {auto-dump, no-auto-dump, no dump support}. Right. Anyway, I don't think this will matter in this particular case in practice, so if you're okay with the code as is: Reviewed-by: Jakub Kicinski <kuba@kernel.org>
On 3/26/2020 2:01 AM, Jakub Kicinski wrote: > On Wed, 25 Mar 2020 21:38:35 +0200 Eran Ben Elisha wrote: >> On 3/25/2020 9:08 PM, Jiri Pirko wrote: >>> Wed, Mar 25, 2020 at 07:45:29PM CET, kuba@kernel.org wrote: >>>> On Wed, 25 Mar 2020 15:26:24 +0200 Eran Ben Elisha wrote: >>>>> On low memory system, run time dumps can consume too much memory. Add >>>>> administrator ability to disable auto dumps per reporter as part of the >>>>> error flow handle routine. >>>>> >>>>> This attribute is not relevant while executing >>>>> DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. >>>>> >>>>> By default, auto dump is activated for any reporter that has a dump method, >>>>> as part of the reporter registration to devlink. >>>>> >>>>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> >>>>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >>>>> --- >>>>> include/uapi/linux/devlink.h | 2 ++ >>>>> net/core/devlink.c | 26 ++++++++++++++++++++++---- >>>>> 2 files changed, 24 insertions(+), 4 deletions(-) >>>>> >>>>> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >>>>> index dfdffc42e87d..e7891d1d2ebd 100644 >>>>> --- a/include/uapi/linux/devlink.h >>>>> +++ b/include/uapi/linux/devlink.h >>>>> @@ -429,6 +429,8 @@ enum devlink_attr { >>>>> DEVLINK_ATTR_NETNS_FD, /* u32 */ >>>>> DEVLINK_ATTR_NETNS_PID, /* u32 */ >>>>> DEVLINK_ATTR_NETNS_ID, /* u32 */ >>>>> + >>>>> + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ >>>>> /* add new attributes above here, update the policy in devlink.c */ >>>>> >>>>> __DEVLINK_ATTR_MAX, >>>>> diff --git a/net/core/devlink.c b/net/core/devlink.c >>>>> index ad69379747ef..e14bf3052289 100644 >>>>> --- a/net/core/devlink.c >>>>> +++ b/net/core/devlink.c >>>>> @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { >>>>> struct mutex dump_lock; /* lock parallel read/write from dump buffers */ >>>>> u64 graceful_period; >>>>> bool auto_recover; >>>>> + bool auto_dump; >>>>> u8 health_state; >>>>> u64 dump_ts; >>>>> u64 dump_real_ts; >>>>> @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, >>>>> reporter->devlink = devlink; >>>>> reporter->graceful_period = graceful_period; >>>>> reporter->auto_recover = !!ops->recover; >>>>> + reporter->auto_dump = !!ops->dump; >>>>> mutex_init(&reporter->dump_lock); >>>>> refcount_set(&reporter->refcount, 1); >>>>> list_add_tail(&reporter->list, &devlink->reporter_list); >>>>> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, >>>>> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, >>>>> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) >>>>> goto reporter_nest_cancel; >>>>> + if (reporter->ops->dump && >>>>> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, >>>>> + reporter->auto_dump)) >>>>> + goto reporter_nest_cancel; >>>> >>>> Since you're making it a u8 - does it make sense to indicate to user >>> >>> Please don't be mistaken. u8 carries a bool here. > > Are you okay with limiting the value in the policy? > > I guess the auto-recover doesn't have it so we'd create a little > inconsistency. > >>>> space whether the dump is disabled or not supported? >>> >>> If you want to expose "not supported", I suggest to do it in another >>> attr. Because this attr is here to do the config from userspace. Would >>> be off if the same enum would carry "not supported". >>> >>> But anyway, since you opened this can, the supported/capabilities >>> should be probably passed by a separate bitfield for all features. >>> >> >> Actually we supports trinary state per x attribute. >> (x can be auto-dump or auto-recover which is supported since day 1) >> >> devlink_nl_health_reporter_fill can set >> DEVLINK_ATTR_HEALTH_REPORTER_AUTO_X to {0 , 1 , no set} >> And user space devlink propagates it accordingly. >> >> If auto_x is supported, user will see "auto_x true" >> If auto_x is not supported, user will see "auto_x false" >> If x is not supported at all, user will not the auto_x at all for this >> reporter. > > Yeah, makes perfect the only glitch is that for auto-dump we have the > old kernel case. auto-recover was there from day 1. > >>>> Right now no attribute means either old kernel or dump not possible. >> when you run on old kernel, you will not see auto-dump attribute at all. >> In any case you won't be able to distinguish in user space between >> {auto-dump, no-auto-dump, no dump support}. > > Right. > > Anyway, I don't think this will matter in this particular case in > practice, so if you're okay with the code as is: > > Reviewed-by: Jakub Kicinski <kuba@kernel.org> > Thanks Jakub. Dave, I see the patchset is currently marked as Changes Requested. Can you please modify?
Thu, Mar 26, 2020 at 01:01:35AM CET, kuba@kernel.org wrote: >On Wed, 25 Mar 2020 21:38:35 +0200 Eran Ben Elisha wrote: >> On 3/25/2020 9:08 PM, Jiri Pirko wrote: >> > Wed, Mar 25, 2020 at 07:45:29PM CET, kuba@kernel.org wrote: >> >> On Wed, 25 Mar 2020 15:26:24 +0200 Eran Ben Elisha wrote: >> >>> On low memory system, run time dumps can consume too much memory. Add >> >>> administrator ability to disable auto dumps per reporter as part of the >> >>> error flow handle routine. >> >>> >> >>> This attribute is not relevant while executing >> >>> DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET. >> >>> >> >>> By default, auto dump is activated for any reporter that has a dump method, >> >>> as part of the reporter registration to devlink. >> >>> >> >>> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> >> >>> Reviewed-by: Jiri Pirko <jiri@mellanox.com> >> >>> --- >> >>> include/uapi/linux/devlink.h | 2 ++ >> >>> net/core/devlink.c | 26 ++++++++++++++++++++++---- >> >>> 2 files changed, 24 insertions(+), 4 deletions(-) >> >>> >> >>> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h >> >>> index dfdffc42e87d..e7891d1d2ebd 100644 >> >>> --- a/include/uapi/linux/devlink.h >> >>> +++ b/include/uapi/linux/devlink.h >> >>> @@ -429,6 +429,8 @@ enum devlink_attr { >> >>> DEVLINK_ATTR_NETNS_FD, /* u32 */ >> >>> DEVLINK_ATTR_NETNS_PID, /* u32 */ >> >>> DEVLINK_ATTR_NETNS_ID, /* u32 */ >> >>> + >> >>> + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ >> >>> /* add new attributes above here, update the policy in devlink.c */ >> >>> >> >>> __DEVLINK_ATTR_MAX, >> >>> diff --git a/net/core/devlink.c b/net/core/devlink.c >> >>> index ad69379747ef..e14bf3052289 100644 >> >>> --- a/net/core/devlink.c >> >>> +++ b/net/core/devlink.c >> >>> @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { >> >>> struct mutex dump_lock; /* lock parallel read/write from dump buffers */ >> >>> u64 graceful_period; >> >>> bool auto_recover; >> >>> + bool auto_dump; >> >>> u8 health_state; >> >>> u64 dump_ts; >> >>> u64 dump_real_ts; >> >>> @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, >> >>> reporter->devlink = devlink; >> >>> reporter->graceful_period = graceful_period; >> >>> reporter->auto_recover = !!ops->recover; >> >>> + reporter->auto_dump = !!ops->dump; >> >>> mutex_init(&reporter->dump_lock); >> >>> refcount_set(&reporter->refcount, 1); >> >>> list_add_tail(&reporter->list, &devlink->reporter_list); >> >>> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, >> >>> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, >> >>> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) >> >>> goto reporter_nest_cancel; >> >>> + if (reporter->ops->dump && >> >>> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, >> >>> + reporter->auto_dump)) >> >>> + goto reporter_nest_cancel; >> >> >> >> Since you're making it a u8 - does it make sense to indicate to user >> > >> > Please don't be mistaken. u8 carries a bool here. > >Are you okay with limiting the value in the policy? Well, not-0 means true. Do you think it is wise to limit to 0/1? [...]
On Thu, 26 Mar 2020 11:22:44 +0100 Jiri Pirko wrote: > >> >>> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, > >> >>> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, > >> >>> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) > >> >>> goto reporter_nest_cancel; > >> >>> + if (reporter->ops->dump && > >> >>> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, > >> >>> + reporter->auto_dump)) > >> >>> + goto reporter_nest_cancel; > >> >> > >> >> Since you're making it a u8 - does it make sense to indicate to user > >> > > >> > Please don't be mistaken. u8 carries a bool here. > > > >Are you okay with limiting the value in the policy? > > Well, not-0 means true. Do you think it is wise to limit to 0/1? Just future proofing, in general seems wise to always constrain the input as much as possible. But in this case we already have similar attrs in the dump which don't have the constraint, and we will probably want consistency, so maybe we're unlikely to use other values. In particular I was wondering if auto-dump value can be extended to mean the number of dumps we want to collect, the current behavior I think matches collecting just one. But obviously this can be solved with a new attr when needed..
Thu, Mar 26, 2020 at 06:39:13PM CET, kuba@kernel.org wrote: >On Thu, 26 Mar 2020 11:22:44 +0100 Jiri Pirko wrote: >> >> >>> @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, >> >> >>> nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, >> >> >>> reporter->dump_real_ts, DEVLINK_ATTR_PAD)) >> >> >>> goto reporter_nest_cancel; >> >> >>> + if (reporter->ops->dump && >> >> >>> + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, >> >> >>> + reporter->auto_dump)) >> >> >>> + goto reporter_nest_cancel; >> >> >> >> >> >> Since you're making it a u8 - does it make sense to indicate to user >> >> > >> >> > Please don't be mistaken. u8 carries a bool here. >> > >> >Are you okay with limiting the value in the policy? >> >> Well, not-0 means true. Do you think it is wise to limit to 0/1? > >Just future proofing, in general seems wise to always constrain the >input as much as possible. But in this case we already have similar >attrs in the dump which don't have the constraint, and we will probably >want consistency, so maybe we're unlikely to use other values. Agreed. > >In particular I was wondering if auto-dump value can be extended to >mean the number of dumps we want to collect, the current behavior I >think matches collecting just one. But obviously this can be solved >with a new attr when needed..
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index dfdffc42e87d..e7891d1d2ebd 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -429,6 +429,8 @@ enum devlink_attr { DEVLINK_ATTR_NETNS_FD, /* u32 */ DEVLINK_ATTR_NETNS_PID, /* u32 */ DEVLINK_ATTR_NETNS_ID, /* u32 */ + + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index ad69379747ef..e14bf3052289 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4837,6 +4837,7 @@ struct devlink_health_reporter { struct mutex dump_lock; /* lock parallel read/write from dump buffers */ u64 graceful_period; bool auto_recover; + bool auto_dump; u8 health_state; u64 dump_ts; u64 dump_real_ts; @@ -4903,6 +4904,7 @@ devlink_health_reporter_create(struct devlink *devlink, reporter->devlink = devlink; reporter->graceful_period = graceful_period; reporter->auto_recover = !!ops->recover; + reporter->auto_dump = !!ops->dump; mutex_init(&reporter->dump_lock); refcount_set(&reporter->refcount, 1); list_add_tail(&reporter->list, &devlink->reporter_list); @@ -4983,6 +4985,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS, reporter->dump_real_ts, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; + if (reporter->ops->dump && + nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + reporter->auto_dump)) + goto reporter_nest_cancel; nla_nest_end(msg, reporter_attr); genlmsg_end(msg, hdr); @@ -5129,10 +5135,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx, NULL); - mutex_unlock(&reporter->dump_lock); + if (reporter->auto_dump) { + mutex_lock(&reporter->dump_lock); + /* store current dump of current error, for later analysis */ + devlink_health_do_dump(reporter, priv_ctx, NULL); + mutex_unlock(&reporter->dump_lock); + } if (reporter->auto_recover) return devlink_health_reporter_recover(reporter, @@ -5306,6 +5314,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, err = -EOPNOTSUPP; goto out; } + if (!reporter->ops->dump && + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) { + err = -EOPNOTSUPP; + goto out; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) reporter->graceful_period = @@ -5315,6 +5328,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, reporter->auto_recover = nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) + reporter->auto_dump = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]); + devlink_health_reporter_put(reporter); return 0; out: @@ -6053,6 +6070,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 }, [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING },