diff mbox series

[net-next,5/5] net: improve napi threaded config

Message ID 20200930192140.4192859-6-weiwan@google.com
State Changes Requested
Delegated to: David Miller
Headers show
Series implement kthread based napi poll | expand

Commit Message

Wei Wang Sept. 30, 2020, 7:21 p.m. UTC
This commit mainly addresses the threaded config to make the switch
between softirq based and kthread based NAPI processing not require
a device down/up.
It also moves the kthread_create() call to the sysfs handler when user
tries to enable "threaded" on napi, and properly handles the
kthread_create() failure. This is because certain drivers do not have
the napi created and linked to the dev when dev_open() is called. So
the previous implementation does not work properly there.

Signed-off-by: Wei Wang <weiwan@google.com>
---
Changes since RFC:
changed the thread name to napi/<dev>-<napi-id>

 net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
 net/core/net-sysfs.c |  9 +++-----
 2 files changed, 31 insertions(+), 27 deletions(-)

Comments

Felix Fietkau Oct. 1, 2020, 10:01 a.m. UTC | #1
On 2020-09-30 21:21, Wei Wang wrote:
> This commit mainly addresses the threaded config to make the switch
> between softirq based and kthread based NAPI processing not require
> a device down/up.
> It also moves the kthread_create() call to the sysfs handler when user
> tries to enable "threaded" on napi, and properly handles the
> kthread_create() failure. This is because certain drivers do not have
> the napi created and linked to the dev when dev_open() is called. So
> the previous implementation does not work properly there.
> 
> Signed-off-by: Wei Wang <weiwan@google.com>
> ---
> Changes since RFC:
> changed the thread name to napi/<dev>-<napi-id>
> 
>  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
>  net/core/net-sysfs.c |  9 +++-----
>  2 files changed, 31 insertions(+), 27 deletions(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index b4f33e442b5e..bf878d3a9d89 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
>  
>  static int napi_threaded_poll(void *data);
>  
> -static void napi_thread_start(struct napi_struct *n)
> +static int napi_kthread_create(struct napi_struct *n)
>  {
> -	if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
> -		n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
> -					   n->dev->name, n->napi_id);
> +	int err = 0;
> +
> +	n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
> +				   n->dev->name, n->napi_id);
> +	if (IS_ERR(n->thread)) {
> +		err = PTR_ERR(n->thread);
> +		pr_err("kthread_create failed with err %d\n", err);
> +		n->thread = NULL;
> +	}
> +
> +	return err;
If I remember correctly, using kthread_create with no explicit first
wakeup means the task will sit there and contribute to system loadavg
until it is woken up the first time.
Shouldn't we use kthread_run here instead?

- Felix
Wei Wang Oct. 1, 2020, 5:01 p.m. UTC | #2
On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
>
>
> On 2020-09-30 21:21, Wei Wang wrote:
> > This commit mainly addresses the threaded config to make the switch
> > between softirq based and kthread based NAPI processing not require
> > a device down/up.
> > It also moves the kthread_create() call to the sysfs handler when user
> > tries to enable "threaded" on napi, and properly handles the
> > kthread_create() failure. This is because certain drivers do not have
> > the napi created and linked to the dev when dev_open() is called. So
> > the previous implementation does not work properly there.
> >
> > Signed-off-by: Wei Wang <weiwan@google.com>
> > ---
> > Changes since RFC:
> > changed the thread name to napi/<dev>-<napi-id>
> >
> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
> >  net/core/net-sysfs.c |  9 +++-----
> >  2 files changed, 31 insertions(+), 27 deletions(-)
> >
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index b4f33e442b5e..bf878d3a9d89 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
> >
> >  static int napi_threaded_poll(void *data);
> >
> > -static void napi_thread_start(struct napi_struct *n)
> > +static int napi_kthread_create(struct napi_struct *n)
> >  {
> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
> > -                                        n->dev->name, n->napi_id);
> > +     int err = 0;
> > +
> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
> > +                                n->dev->name, n->napi_id);
> > +     if (IS_ERR(n->thread)) {
> > +             err = PTR_ERR(n->thread);
> > +             pr_err("kthread_create failed with err %d\n", err);
> > +             n->thread = NULL;
> > +     }
> > +
> > +     return err;
> If I remember correctly, using kthread_create with no explicit first
> wakeup means the task will sit there and contribute to system loadavg
> until it is woken up the first time.
> Shouldn't we use kthread_run here instead?
>

Right. kthread_create() basically creates the thread and leaves it in
sleep mode. I think that is what we want. We rely on the next
___napi_schedule() call to wake up this thread when there is work to
do.

> - Felix
Felix Fietkau Oct. 1, 2020, 5:11 p.m. UTC | #3
On 2020-10-01 19:01, Wei Wang wrote:
> On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
>>
>>
>> On 2020-09-30 21:21, Wei Wang wrote:
>> > This commit mainly addresses the threaded config to make the switch
>> > between softirq based and kthread based NAPI processing not require
>> > a device down/up.
>> > It also moves the kthread_create() call to the sysfs handler when user
>> > tries to enable "threaded" on napi, and properly handles the
>> > kthread_create() failure. This is because certain drivers do not have
>> > the napi created and linked to the dev when dev_open() is called. So
>> > the previous implementation does not work properly there.
>> >
>> > Signed-off-by: Wei Wang <weiwan@google.com>
>> > ---
>> > Changes since RFC:
>> > changed the thread name to napi/<dev>-<napi-id>
>> >
>> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
>> >  net/core/net-sysfs.c |  9 +++-----
>> >  2 files changed, 31 insertions(+), 27 deletions(-)
>> >
>> > diff --git a/net/core/dev.c b/net/core/dev.c
>> > index b4f33e442b5e..bf878d3a9d89 100644
>> > --- a/net/core/dev.c
>> > +++ b/net/core/dev.c
>> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
>> >
>> >  static int napi_threaded_poll(void *data);
>> >
>> > -static void napi_thread_start(struct napi_struct *n)
>> > +static int napi_kthread_create(struct napi_struct *n)
>> >  {
>> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
>> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
>> > -                                        n->dev->name, n->napi_id);
>> > +     int err = 0;
>> > +
>> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
>> > +                                n->dev->name, n->napi_id);
>> > +     if (IS_ERR(n->thread)) {
>> > +             err = PTR_ERR(n->thread);
>> > +             pr_err("kthread_create failed with err %d\n", err);
>> > +             n->thread = NULL;
>> > +     }
>> > +
>> > +     return err;
>> If I remember correctly, using kthread_create with no explicit first
>> wakeup means the task will sit there and contribute to system loadavg
>> until it is woken up the first time.
>> Shouldn't we use kthread_run here instead?
>>
> 
> Right. kthread_create() basically creates the thread and leaves it in
> sleep mode. I think that is what we want. We rely on the next
> ___napi_schedule() call to wake up this thread when there is work to
> do.
But what if you have a device that's basically idle and napi isn't
scheduled until much later? It will get a confusing loadavg until then.
I'd prefer waking up the thread immediately and filtering going back to
sleep once in the thread function before running the loop if
NAPI_STATE_SCHED wasn't set.

- Felix
Eric Dumazet Oct. 1, 2020, 6:03 p.m. UTC | #4
On Thu, Oct 1, 2020 at 7:12 PM Felix Fietkau <nbd@nbd.name> wrote:
>
> On 2020-10-01 19:01, Wei Wang wrote:
> > On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
> >>
> >>
> >> On 2020-09-30 21:21, Wei Wang wrote:
> >> > This commit mainly addresses the threaded config to make the switch
> >> > between softirq based and kthread based NAPI processing not require
> >> > a device down/up.
> >> > It also moves the kthread_create() call to the sysfs handler when user
> >> > tries to enable "threaded" on napi, and properly handles the
> >> > kthread_create() failure. This is because certain drivers do not have
> >> > the napi created and linked to the dev when dev_open() is called. So
> >> > the previous implementation does not work properly there.
> >> >
> >> > Signed-off-by: Wei Wang <weiwan@google.com>
> >> > ---
> >> > Changes since RFC:
> >> > changed the thread name to napi/<dev>-<napi-id>
> >> >
> >> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
> >> >  net/core/net-sysfs.c |  9 +++-----
> >> >  2 files changed, 31 insertions(+), 27 deletions(-)
> >> >
> >> > diff --git a/net/core/dev.c b/net/core/dev.c
> >> > index b4f33e442b5e..bf878d3a9d89 100644
> >> > --- a/net/core/dev.c
> >> > +++ b/net/core/dev.c
> >> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
> >> >
> >> >  static int napi_threaded_poll(void *data);
> >> >
> >> > -static void napi_thread_start(struct napi_struct *n)
> >> > +static int napi_kthread_create(struct napi_struct *n)
> >> >  {
> >> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
> >> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
> >> > -                                        n->dev->name, n->napi_id);
> >> > +     int err = 0;
> >> > +
> >> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
> >> > +                                n->dev->name, n->napi_id);
> >> > +     if (IS_ERR(n->thread)) {
> >> > +             err = PTR_ERR(n->thread);
> >> > +             pr_err("kthread_create failed with err %d\n", err);
> >> > +             n->thread = NULL;
> >> > +     }
> >> > +
> >> > +     return err;
> >> If I remember correctly, using kthread_create with no explicit first
> >> wakeup means the task will sit there and contribute to system loadavg
> >> until it is woken up the first time.
> >> Shouldn't we use kthread_run here instead?
> >>
> >
> > Right. kthread_create() basically creates the thread and leaves it in
> > sleep mode. I think that is what we want. We rely on the next
> > ___napi_schedule() call to wake up this thread when there is work to
> > do.
> But what if you have a device that's basically idle and napi isn't
> scheduled until much later? It will get a confusing loadavg until then.
> I'd prefer waking up the thread immediately and filtering going back to
> sleep once in the thread function before running the loop if
> NAPI_STATE_SCHED wasn't set.
>

I was not aware of this kthread_create() impact on loadavg.
This seems like a bug to me. (although I do not care about loadavg)

Do you have pointers on some documentation ?

Probably not a big deal, but this seems quite odd to me.
Felix Fietkau Oct. 1, 2020, 6:37 p.m. UTC | #5
On 2020-10-01 20:03, Eric Dumazet wrote:
> On Thu, Oct 1, 2020 at 7:12 PM Felix Fietkau <nbd@nbd.name> wrote:
>>
>> On 2020-10-01 19:01, Wei Wang wrote:
>> > On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
>> >>
>> >>
>> >> On 2020-09-30 21:21, Wei Wang wrote:
>> >> > This commit mainly addresses the threaded config to make the switch
>> >> > between softirq based and kthread based NAPI processing not require
>> >> > a device down/up.
>> >> > It also moves the kthread_create() call to the sysfs handler when user
>> >> > tries to enable "threaded" on napi, and properly handles the
>> >> > kthread_create() failure. This is because certain drivers do not have
>> >> > the napi created and linked to the dev when dev_open() is called. So
>> >> > the previous implementation does not work properly there.
>> >> >
>> >> > Signed-off-by: Wei Wang <weiwan@google.com>
>> >> > ---
>> >> > Changes since RFC:
>> >> > changed the thread name to napi/<dev>-<napi-id>
>> >> >
>> >> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
>> >> >  net/core/net-sysfs.c |  9 +++-----
>> >> >  2 files changed, 31 insertions(+), 27 deletions(-)
>> >> >
>> >> > diff --git a/net/core/dev.c b/net/core/dev.c
>> >> > index b4f33e442b5e..bf878d3a9d89 100644
>> >> > --- a/net/core/dev.c
>> >> > +++ b/net/core/dev.c
>> >> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
>> >> >
>> >> >  static int napi_threaded_poll(void *data);
>> >> >
>> >> > -static void napi_thread_start(struct napi_struct *n)
>> >> > +static int napi_kthread_create(struct napi_struct *n)
>> >> >  {
>> >> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
>> >> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
>> >> > -                                        n->dev->name, n->napi_id);
>> >> > +     int err = 0;
>> >> > +
>> >> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
>> >> > +                                n->dev->name, n->napi_id);
>> >> > +     if (IS_ERR(n->thread)) {
>> >> > +             err = PTR_ERR(n->thread);
>> >> > +             pr_err("kthread_create failed with err %d\n", err);
>> >> > +             n->thread = NULL;
>> >> > +     }
>> >> > +
>> >> > +     return err;
>> >> If I remember correctly, using kthread_create with no explicit first
>> >> wakeup means the task will sit there and contribute to system loadavg
>> >> until it is woken up the first time.
>> >> Shouldn't we use kthread_run here instead?
>> >>
>> >
>> > Right. kthread_create() basically creates the thread and leaves it in
>> > sleep mode. I think that is what we want. We rely on the next
>> > ___napi_schedule() call to wake up this thread when there is work to
>> > do.
>> But what if you have a device that's basically idle and napi isn't
>> scheduled until much later? It will get a confusing loadavg until then.
>> I'd prefer waking up the thread immediately and filtering going back to
>> sleep once in the thread function before running the loop if
>> NAPI_STATE_SCHED wasn't set.
>>
> 
> I was not aware of this kthread_create() impact on loadavg.
> This seems like a bug to me. (although I do not care about loadavg)
> 
> Do you have pointers on some documentation ?
I don't have any specific documentation pointers, but this is something
I observed on several occasions when playing with kthreads.

From what I can find in the loadavg code it seems that tasks in
TASK_UNINTERRUPTIBLE state are counted for loadavg alongside actually
runnable tasks. This seems intentional to me, but I don't know why it
was made like this.

A kthread does not start the thread function until it has been woken up
at least once, most likely to give the creating code a chance to perform
some initializations after successfully creating the thread, before the
thread function starts doing something. Instead, kthread() sets
TASK_UNINTERRUPTIBLE and calls schedule() once.

> Probably not a big deal, but this seems quite odd to me.
I've run into enough users that look at loadavg as a measure of system
load and would likely start reporting bugs if they observe such
behavior. I'd like to avoid that.

- Felix
Wei Wang Oct. 1, 2020, 7:24 p.m. UTC | #6
On Thu, Oct 1, 2020 at 11:38 AM Felix Fietkau <nbd@nbd.name> wrote:
>
>
> On 2020-10-01 20:03, Eric Dumazet wrote:
> > On Thu, Oct 1, 2020 at 7:12 PM Felix Fietkau <nbd@nbd.name> wrote:
> >>
> >> On 2020-10-01 19:01, Wei Wang wrote:
> >> > On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
> >> >>
> >> >>
> >> >> On 2020-09-30 21:21, Wei Wang wrote:
> >> >> > This commit mainly addresses the threaded config to make the switch
> >> >> > between softirq based and kthread based NAPI processing not require
> >> >> > a device down/up.
> >> >> > It also moves the kthread_create() call to the sysfs handler when user
> >> >> > tries to enable "threaded" on napi, and properly handles the
> >> >> > kthread_create() failure. This is because certain drivers do not have
> >> >> > the napi created and linked to the dev when dev_open() is called. So
> >> >> > the previous implementation does not work properly there.
> >> >> >
> >> >> > Signed-off-by: Wei Wang <weiwan@google.com>
> >> >> > ---
> >> >> > Changes since RFC:
> >> >> > changed the thread name to napi/<dev>-<napi-id>
> >> >> >
> >> >> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
> >> >> >  net/core/net-sysfs.c |  9 +++-----
> >> >> >  2 files changed, 31 insertions(+), 27 deletions(-)
> >> >> >
> >> >> > diff --git a/net/core/dev.c b/net/core/dev.c
> >> >> > index b4f33e442b5e..bf878d3a9d89 100644
> >> >> > --- a/net/core/dev.c
> >> >> > +++ b/net/core/dev.c
> >> >> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
> >> >> >
> >> >> >  static int napi_threaded_poll(void *data);
> >> >> >
> >> >> > -static void napi_thread_start(struct napi_struct *n)
> >> >> > +static int napi_kthread_create(struct napi_struct *n)
> >> >> >  {
> >> >> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
> >> >> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
> >> >> > -                                        n->dev->name, n->napi_id);
> >> >> > +     int err = 0;
> >> >> > +
> >> >> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
> >> >> > +                                n->dev->name, n->napi_id);
> >> >> > +     if (IS_ERR(n->thread)) {
> >> >> > +             err = PTR_ERR(n->thread);
> >> >> > +             pr_err("kthread_create failed with err %d\n", err);
> >> >> > +             n->thread = NULL;
> >> >> > +     }
> >> >> > +
> >> >> > +     return err;
> >> >> If I remember correctly, using kthread_create with no explicit first
> >> >> wakeup means the task will sit there and contribute to system loadavg
> >> >> until it is woken up the first time.
> >> >> Shouldn't we use kthread_run here instead?
> >> >>
> >> >
> >> > Right. kthread_create() basically creates the thread and leaves it in
> >> > sleep mode. I think that is what we want. We rely on the next
> >> > ___napi_schedule() call to wake up this thread when there is work to
> >> > do.
> >> But what if you have a device that's basically idle and napi isn't
> >> scheduled until much later? It will get a confusing loadavg until then.
> >> I'd prefer waking up the thread immediately and filtering going back to
> >> sleep once in the thread function before running the loop if
> >> NAPI_STATE_SCHED wasn't set.
> >>
> >
> > I was not aware of this kthread_create() impact on loadavg.
> > This seems like a bug to me. (although I do not care about loadavg)
> >
> > Do you have pointers on some documentation ?

I found this link:
http://www.brendangregg.com/blog/2017-08-08/linux-load-averages.html
It has a section called "Linux Uninterruptible Tasks" which explains
this behavior specifically. But I don't see a good conclusion on why.
Seems to be a convention.
IMHO, this is actually the problem/decision of the loadavg. It should
not impact how the kernel code is implemented. I think it makes more
sense to only wake up the thread when there is work to do.

> I don't have any specific documentation pointers, but this is something
> I observed on several occasions when playing with kthreads.
>
> From what I can find in the loadavg code it seems that tasks in
> TASK_UNINTERRUPTIBLE state are counted for loadavg alongside actually
> runnable tasks. This seems intentional to me, but I don't know why it
> was made like this.
>
> A kthread does not start the thread function until it has been woken up
> at least once, most likely to give the creating code a chance to perform
> some initializations after successfully creating the thread, before the
> thread function starts doing something. Instead, kthread() sets
> TASK_UNINTERRUPTIBLE and calls schedule() once.
>
> > Probably not a big deal, but this seems quite odd to me.
> I've run into enough users that look at loadavg as a measure of system
> load and would likely start reporting bugs if they observe such
> behavior. I'd like to avoid that.
>
> - Felix
Felix Fietkau Oct. 1, 2020, 8:48 p.m. UTC | #7
On 2020-10-01 21:24, Wei Wang wrote:
> On Thu, Oct 1, 2020 at 11:38 AM Felix Fietkau <nbd@nbd.name> wrote:
>>
>>
>> On 2020-10-01 20:03, Eric Dumazet wrote:
>> > On Thu, Oct 1, 2020 at 7:12 PM Felix Fietkau <nbd@nbd.name> wrote:
>> >>
>> >> On 2020-10-01 19:01, Wei Wang wrote:
>> >> > On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
>> >> >>
>> >> >>
>> >> >> On 2020-09-30 21:21, Wei Wang wrote:
>> >> >> > This commit mainly addresses the threaded config to make the switch
>> >> >> > between softirq based and kthread based NAPI processing not require
>> >> >> > a device down/up.
>> >> >> > It also moves the kthread_create() call to the sysfs handler when user
>> >> >> > tries to enable "threaded" on napi, and properly handles the
>> >> >> > kthread_create() failure. This is because certain drivers do not have
>> >> >> > the napi created and linked to the dev when dev_open() is called. So
>> >> >> > the previous implementation does not work properly there.
>> >> >> >
>> >> >> > Signed-off-by: Wei Wang <weiwan@google.com>
>> >> >> > ---
>> >> >> > Changes since RFC:
>> >> >> > changed the thread name to napi/<dev>-<napi-id>
>> >> >> >
>> >> >> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
>> >> >> >  net/core/net-sysfs.c |  9 +++-----
>> >> >> >  2 files changed, 31 insertions(+), 27 deletions(-)
>> >> >> >
>> >> >> > diff --git a/net/core/dev.c b/net/core/dev.c
>> >> >> > index b4f33e442b5e..bf878d3a9d89 100644
>> >> >> > --- a/net/core/dev.c
>> >> >> > +++ b/net/core/dev.c
>> >> >> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
>> >> >> >
>> >> >> >  static int napi_threaded_poll(void *data);
>> >> >> >
>> >> >> > -static void napi_thread_start(struct napi_struct *n)
>> >> >> > +static int napi_kthread_create(struct napi_struct *n)
>> >> >> >  {
>> >> >> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
>> >> >> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
>> >> >> > -                                        n->dev->name, n->napi_id);
>> >> >> > +     int err = 0;
>> >> >> > +
>> >> >> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
>> >> >> > +                                n->dev->name, n->napi_id);
>> >> >> > +     if (IS_ERR(n->thread)) {
>> >> >> > +             err = PTR_ERR(n->thread);
>> >> >> > +             pr_err("kthread_create failed with err %d\n", err);
>> >> >> > +             n->thread = NULL;
>> >> >> > +     }
>> >> >> > +
>> >> >> > +     return err;
>> >> >> If I remember correctly, using kthread_create with no explicit first
>> >> >> wakeup means the task will sit there and contribute to system loadavg
>> >> >> until it is woken up the first time.
>> >> >> Shouldn't we use kthread_run here instead?
>> >> >>
>> >> >
>> >> > Right. kthread_create() basically creates the thread and leaves it in
>> >> > sleep mode. I think that is what we want. We rely on the next
>> >> > ___napi_schedule() call to wake up this thread when there is work to
>> >> > do.
>> >> But what if you have a device that's basically idle and napi isn't
>> >> scheduled until much later? It will get a confusing loadavg until then.
>> >> I'd prefer waking up the thread immediately and filtering going back to
>> >> sleep once in the thread function before running the loop if
>> >> NAPI_STATE_SCHED wasn't set.
>> >>
>> >
>> > I was not aware of this kthread_create() impact on loadavg.
>> > This seems like a bug to me. (although I do not care about loadavg)
>> >
>> > Do you have pointers on some documentation ?
> 
> I found this link:
> http://www.brendangregg.com/blog/2017-08-08/linux-load-averages.html
> It has a section called "Linux Uninterruptible Tasks" which explains
> this behavior specifically. But I don't see a good conclusion on why.
> Seems to be a convention.
> IMHO, this is actually the problem/decision of the loadavg. It should
> not impact how the kernel code is implemented. I think it makes more
> sense to only wake up the thread when there is work to do.
There were other users of kthread where the same issue was fixed.
With a quick search, I found these commits:
e890591413819eeb604207ad3261ba617b2ec0bb
3f776e8a25a9d281125490562e1cc5bd7c14cf7c

Please note that one of these describes that a kthread that was created
but not woken was triggering a blocked task warning - so it's not just
the loadavg that matters here.

All the other users of kthread that I looked at also do an initial
wakeup of the thread. Not doing it seems like wrong use of the API to me.

- Felix
Wei Wang Oct. 1, 2020, 10:42 p.m. UTC | #8
On Thu, Oct 1, 2020 at 1:48 PM Felix Fietkau <nbd@nbd.name> wrote:
>
>
> On 2020-10-01 21:24, Wei Wang wrote:
> > On Thu, Oct 1, 2020 at 11:38 AM Felix Fietkau <nbd@nbd.name> wrote:
> >>
> >>
> >> On 2020-10-01 20:03, Eric Dumazet wrote:
> >> > On Thu, Oct 1, 2020 at 7:12 PM Felix Fietkau <nbd@nbd.name> wrote:
> >> >>
> >> >> On 2020-10-01 19:01, Wei Wang wrote:
> >> >> > On Thu, Oct 1, 2020 at 3:01 AM Felix Fietkau <nbd@nbd.name> wrote:
> >> >> >>
> >> >> >>
> >> >> >> On 2020-09-30 21:21, Wei Wang wrote:
> >> >> >> > This commit mainly addresses the threaded config to make the switch
> >> >> >> > between softirq based and kthread based NAPI processing not require
> >> >> >> > a device down/up.
> >> >> >> > It also moves the kthread_create() call to the sysfs handler when user
> >> >> >> > tries to enable "threaded" on napi, and properly handles the
> >> >> >> > kthread_create() failure. This is because certain drivers do not have
> >> >> >> > the napi created and linked to the dev when dev_open() is called. So
> >> >> >> > the previous implementation does not work properly there.
> >> >> >> >
> >> >> >> > Signed-off-by: Wei Wang <weiwan@google.com>
> >> >> >> > ---
> >> >> >> > Changes since RFC:
> >> >> >> > changed the thread name to napi/<dev>-<napi-id>
> >> >> >> >
> >> >> >> >  net/core/dev.c       | 49 +++++++++++++++++++++++++-------------------
> >> >> >> >  net/core/net-sysfs.c |  9 +++-----
> >> >> >> >  2 files changed, 31 insertions(+), 27 deletions(-)
> >> >> >> >
> >> >> >> > diff --git a/net/core/dev.c b/net/core/dev.c
> >> >> >> > index b4f33e442b5e..bf878d3a9d89 100644
> >> >> >> > --- a/net/core/dev.c
> >> >> >> > +++ b/net/core/dev.c
> >> >> >> > @@ -1490,17 +1490,24 @@ EXPORT_SYMBOL(netdev_notify_peers);
> >> >> >> >
> >> >> >> >  static int napi_threaded_poll(void *data);
> >> >> >> >
> >> >> >> > -static void napi_thread_start(struct napi_struct *n)
> >> >> >> > +static int napi_kthread_create(struct napi_struct *n)
> >> >> >> >  {
> >> >> >> > -     if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
> >> >> >> > -             n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
> >> >> >> > -                                        n->dev->name, n->napi_id);
> >> >> >> > +     int err = 0;
> >> >> >> > +
> >> >> >> > +     n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
> >> >> >> > +                                n->dev->name, n->napi_id);
> >> >> >> > +     if (IS_ERR(n->thread)) {
> >> >> >> > +             err = PTR_ERR(n->thread);
> >> >> >> > +             pr_err("kthread_create failed with err %d\n", err);
> >> >> >> > +             n->thread = NULL;
> >> >> >> > +     }
> >> >> >> > +
> >> >> >> > +     return err;
> >> >> >> If I remember correctly, using kthread_create with no explicit first
> >> >> >> wakeup means the task will sit there and contribute to system loadavg
> >> >> >> until it is woken up the first time.
> >> >> >> Shouldn't we use kthread_run here instead?
> >> >> >>
> >> >> >
> >> >> > Right. kthread_create() basically creates the thread and leaves it in
> >> >> > sleep mode. I think that is what we want. We rely on the next
> >> >> > ___napi_schedule() call to wake up this thread when there is work to
> >> >> > do.
> >> >> But what if you have a device that's basically idle and napi isn't
> >> >> scheduled until much later? It will get a confusing loadavg until then.
> >> >> I'd prefer waking up the thread immediately and filtering going back to
> >> >> sleep once in the thread function before running the loop if
> >> >> NAPI_STATE_SCHED wasn't set.
> >> >>
> >> >
> >> > I was not aware of this kthread_create() impact on loadavg.
> >> > This seems like a bug to me. (although I do not care about loadavg)
> >> >
> >> > Do you have pointers on some documentation ?
> >
> > I found this link:
> > http://www.brendangregg.com/blog/2017-08-08/linux-load-averages.html
> > It has a section called "Linux Uninterruptible Tasks" which explains
> > this behavior specifically. But I don't see a good conclusion on why.
> > Seems to be a convention.
> > IMHO, this is actually the problem/decision of the loadavg. It should
> > not impact how the kernel code is implemented. I think it makes more
> > sense to only wake up the thread when there is work to do.
> There were other users of kthread where the same issue was fixed.
> With a quick search, I found these commits:
> e890591413819eeb604207ad3261ba617b2ec0bb
> 3f776e8a25a9d281125490562e1cc5bd7c14cf7c
>
> Please note that one of these describes that a kthread that was created
> but not woken was triggering a blocked task warning - so it's not just
> the loadavg that matters here.
>
> All the other users of kthread that I looked at also do an initial
> wakeup of the thread. Not doing it seems like wrong use of the API to me.
>

Thanks Felix for digging up the above commits. Very helpful. I will
change it to kthread_run() in v2.

> - Felix
diff mbox series

Patch

diff --git a/net/core/dev.c b/net/core/dev.c
index b4f33e442b5e..bf878d3a9d89 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1490,17 +1490,24 @@  EXPORT_SYMBOL(netdev_notify_peers);
 
 static int napi_threaded_poll(void *data);
 
-static void napi_thread_start(struct napi_struct *n)
+static int napi_kthread_create(struct napi_struct *n)
 {
-	if (test_bit(NAPI_STATE_THREADED, &n->state) && !n->thread)
-		n->thread = kthread_create(napi_threaded_poll, n, "%s-%d",
-					   n->dev->name, n->napi_id);
+	int err = 0;
+
+	n->thread = kthread_create(napi_threaded_poll, n, "napi/%s-%d",
+				   n->dev->name, n->napi_id);
+	if (IS_ERR(n->thread)) {
+		err = PTR_ERR(n->thread);
+		pr_err("kthread_create failed with err %d\n", err);
+		n->thread = NULL;
+	}
+
+	return err;
 }
 
 static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
-	struct napi_struct *n;
 	int ret;
 
 	ASSERT_RTNL();
@@ -1532,9 +1539,6 @@  static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
 	if (!ret && ops->ndo_open)
 		ret = ops->ndo_open(dev);
 
-	list_for_each_entry(n, &dev->napi_list, dev_list)
-		napi_thread_start(n);
-
 	netpoll_poll_enable(dev);
 
 	if (ret)
@@ -1585,6 +1589,7 @@  static void napi_thread_stop(struct napi_struct *n)
 	if (!n->thread)
 		return;
 	kthread_stop(n->thread);
+	clear_bit(NAPI_STATE_THREADED, &n->state);
 	n->thread = NULL;
 }
 
@@ -4267,7 +4272,7 @@  int gro_normal_batch __read_mostly = 8;
 static inline void ____napi_schedule(struct softnet_data *sd,
 				     struct napi_struct *napi)
 {
-	if (napi->thread) {
+	if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
 		wake_up_process(napi->thread);
 		return;
 	}
@@ -6687,25 +6692,25 @@  static void init_gro_hash(struct napi_struct *napi)
 
 int napi_set_threaded(struct napi_struct *n, bool threaded)
 {
-	ASSERT_RTNL();
+	int err = 0;
 
-	if (n->dev->flags & IFF_UP)
-		return -EBUSY;
+	ASSERT_RTNL();
 
 	if (threaded == !!test_bit(NAPI_STATE_THREADED, &n->state))
 		return 0;
-	if (threaded)
+	if (threaded) {
+		if (!n->thread) {
+			err = napi_kthread_create(n);
+			if (err)
+				goto out;
+		}
 		set_bit(NAPI_STATE_THREADED, &n->state);
-	else
+	} else {
 		clear_bit(NAPI_STATE_THREADED, &n->state);
+	}
 
-	/* if the device is initializing, nothing todo */
-	if (test_bit(__LINK_STATE_START, &n->dev->state))
-		return 0;
-
-	napi_thread_stop(n);
-	napi_thread_start(n);
-	return 0;
+out:
+	return err;
 }
 EXPORT_SYMBOL(napi_set_threaded);
 
@@ -6750,6 +6755,7 @@  void napi_disable(struct napi_struct *n)
 		msleep(1);
 
 	hrtimer_cancel(&n->timer);
+	napi_thread_stop(n);
 
 	clear_bit(NAPI_STATE_DISABLE, &n->state);
 }
@@ -6870,6 +6876,7 @@  static int napi_thread_wait(struct napi_struct *napi)
 
 	while (!kthread_should_stop() && !napi_disable_pending(napi)) {
 		if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+			WARN_ON(!list_empty(&napi->poll_list));
 			__set_current_state(TASK_RUNNING);
 			return 0;
 		}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index fe81b344447d..b54dbccf00be 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -609,11 +609,6 @@  static ssize_t threaded_store(struct device *dev,
 		goto unlock;
 	}
 
-	if (netdev->flags & IFF_UP) {
-		ret = -EBUSY;
-		goto unlock;
-	}
-
 	bmap = __alloc_thread_bitmap(netdev, &bits);
 	if (!bmap) {
 		ret = -ENOMEM;
@@ -626,7 +621,9 @@  static ssize_t threaded_store(struct device *dev,
 
 	i = 0;
 	list_for_each_entry(n, &netdev->napi_list, dev_list) {
-		napi_set_threaded(n, test_bit(i, bmap));
+		ret = napi_set_threaded(n, test_bit(i, bmap));
+		if (ret)
+			goto free_unlock;
 		i++;
 	}
 	ret = len;