diff mbox series

[v2] linux: fix accuracy of get_nprocs and get_nprocs_conf [BZ #28865]

Message ID 20220207135736.GA31310@altlinux.org
State New
Headers show
Series [v2] linux: fix accuracy of get_nprocs and get_nprocs_conf [BZ #28865] | expand

Commit Message

Dmitry V. Levin Feb. 7, 2022, 1:57 p.m. UTC
get_nprocs() and get_nprocs_conf() use various methods to obtain an
accurate number of processors.  Re-introduce __get_nprocs_sched() as
a source of information, and fix the order in which these methods are
used to return the most accurate information.  The primary source of
information used in both functions remains unchanged.

This also changes __get_nprocs_sched() error return value from 2 to 0,
but all its users are already prepared to handle that.

Old behavior:
  get_nprocs:
    /sys/devices/system/cpu/online -> /proc/stat -> 2
  get_nprocs_conf:
    /sys/devices/system/cpu/ -> /proc/stat -> 2

New behavior:
  get_nprocs:
    /sys/devices/system/cpu/online -> /proc/stat -> sched_getaffinity -> 2
  get_nprocs_conf:
    /sys/devices/system/cpu/ -> /proc/stat -> sched_getaffinity -> 2

Fixes: 342298278e ("linux: Revert the use of sched_getaffinity on get_nproc")
Closes: BZ #28865
---

v2: prioritize /proc/stat over sched_getaffinity in get_nprocs().

 sysdeps/unix/sysv/linux/getsysstats.c | 94 ++++++++++++++++++---------
 1 file changed, 63 insertions(+), 31 deletions(-)

Comments

Adhemerval Zanella Feb. 8, 2022, 7:34 p.m. UTC | #1
On 07/02/2022 10:57, Dmitry V. Levin wrote:
> get_nprocs() and get_nprocs_conf() use various methods to obtain an
> accurate number of processors.  Re-introduce __get_nprocs_sched() as
> a source of information, and fix the order in which these methods are
> used to return the most accurate information.  The primary source of
> information used in both functions remains unchanged.
> 
> This also changes __get_nprocs_sched() error return value from 2 to 0,
> but all its users are already prepared to handle that.
> 
> Old behavior:
>   get_nprocs:
>     /sys/devices/system/cpu/online -> /proc/stat -> 2
>   get_nprocs_conf:
>     /sys/devices/system/cpu/ -> /proc/stat -> 2
> 
> New behavior:
>   get_nprocs:
>     /sys/devices/system/cpu/online -> /proc/stat -> sched_getaffinity -> 2
>   get_nprocs_conf:
>     /sys/devices/system/cpu/ -> /proc/stat -> sched_getaffinity -> 2
> 
> Fixes: 342298278e ("linux: Revert the use of sched_getaffinity on get_nproc")
> Closes: BZ #28865

I am still not fully sure if sched_getaffinity is a correct fallback for an
API that should return a system overview; the issue that led to the
removal of sched_getaffinity was that it is subject to per-process filtering
(either by seccomp, cgroup, etc.) and it might trigger some wrong behavior
in some programs (such as monitoring tools and JVMs).

However if the environment does not provide a way to actually obtain such
information I guess sched_getaffinity should not make things worse (it does
not make sense to assume multiprocessor if the process is not allowed more
than one CPU, and monitoring tools should not work if sysfs/procfs are
not present).

LGTM with some nits below.  I think you might use brackets instead of
square brackets in the title (to trigger the bugzilla scripts).

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
> 
> v2: prioritize /proc/stat over sched_getaffinity in get_nprocs().
> 
>  sysdeps/unix/sysv/linux/getsysstats.c | 94 ++++++++++++++++++---------
>  1 file changed, 63 insertions(+), 31 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
> index c98c8ce3d4..ef77ed193c 100644
> --- a/sysdeps/unix/sysv/linux/getsysstats.c
> +++ b/sysdeps/unix/sysv/linux/getsysstats.c
> @@ -50,9 +50,8 @@ __get_nprocs_sched (void)
>         is an arbitrary values assuming such systems should be rare and there
>         is no offline cpus.  */
>      return max_num_cpus;
> -  /* Some other error.  2 is conservative (not a uniprocessor system, so
> -     atomics are needed). */
> -  return 2;
> +  /* Some other error.  */
> +  return 0;
>  }
>  
>  static char *
> @@ -108,22 +107,19 @@ next_line (int fd, char *const buffer, char **cp, char **re,
>  }
>  
>  static int
> -get_nproc_stat (char *buffer, size_t buffer_size)
> +get_nproc_stat (void)
>  {
> +  enum { buffer_size = 1024 };
> +  char buffer[buffer_size];
>    char *buffer_end = buffer + buffer_size;
>    char *cp = buffer_end;
>    char *re = buffer_end;
> -
> -  /* Default to an SMP system in case we cannot obtain an accurate
> -     number.  */
> -  int result = 2;
> +  int result = 0;
>  
>    const int flags = O_RDONLY | O_CLOEXEC;
>    int fd = __open_nocancel ("/proc/stat", flags);
>    if (fd != -1)
>      {
> -      result = 0;
> -
>        char *l;
>        while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL)
>  	/* The current format of /proc/stat has all the cpu* entries
> @@ -139,8 +135,8 @@ get_nproc_stat (char *buffer, size_t buffer_size)
>    return result;
>  }
>  
> -int
> -__get_nprocs (void)
> +static int
> +get_nprocs_cpu_online (void)
>  {
>    enum { buffer_size = 1024 };
>    char buffer[buffer_size];
> @@ -179,7 +175,8 @@ __get_nprocs (void)
>  		  }
>  	      }
>  
> -	    result += m - n + 1;
> +	    if (m >= n)
> +	      result += m - n + 1;
>  
>  	    l = endp;
>  	    if (l < re && *l == ',')
> @@ -188,28 +185,18 @@ __get_nprocs (void)
>  	while (l < re && *l != '\n');
>  
>        __close_nocancel_nostatus (fd);
> -
> -      if (result > 0)
> -	return result;
>      }
>  
> -  return get_nproc_stat (buffer, buffer_size);
> +  return result;
>  }
> -libc_hidden_def (__get_nprocs)
> -weak_alias (__get_nprocs, get_nprocs)
> -
>  
> -/* On some architectures it is possible to distinguish between configured
> -   and active cpus.  */
> -int
> -__get_nprocs_conf (void)
> +static int
> +get_nprocs_cpu (void)
>  {
> -  /* Try to use the sysfs filesystem.  It has actual information about
> -     online processors.  */
> +  int count = 0;
>    DIR *dir = __opendir ("/sys/devices/system/cpu");
>    if (dir != NULL)
>      {
> -      int count = 0;
>        struct dirent64 *d;
>  
>        while ((d = __readdir64 (dir)) != NULL)
> @@ -224,12 +211,57 @@ __get_nprocs_conf (void)
>  
>        __closedir (dir);
>  
> -      return count;
>      }
> +  return count;
> +}
>  
> -  enum { buffer_size = 1024 };
> -  char buffer[buffer_size];
> -  return get_nproc_stat (buffer, buffer_size);
> +static int
> +get_nprocs_fallback (void)
> +{
> +  int result;
> +
> +  /* Try /proc/stat first.  */
> +  result = get_nproc_stat ();
> +  if (result)

No implicit check.

> +    return result;
> +
> +  /* Try sched_getaffinity.  */
> +  result = __get_nprocs_sched ();
> +  if (result)

Ditto.

> +    return result;
> +
> +  /* We failed to obtain an accurate number.  Be conservative: return
> +     the smallest number meaning that this is not a uniprocessor system,
> +     so atomics are needed.  */
> +  return 2;
> +}
> +
> +int
> +__get_nprocs (void)
> +{
> +  /* Try /sys/devices/system/cpu/online first.  */
> +  int result = get_nprocs_cpu_online ();
> +  if (result)

Ditto.

> +    return result;
> +
> +  /* Fall back to /proc/stat and sched_getaffinity.  */
> +  return get_nprocs_fallback ();
> +}
> +libc_hidden_def (__get_nprocs)
> +weak_alias (__get_nprocs, get_nprocs)
> +
> +/* On some architectures it is possible to distinguish between configured
> +   and active cpus.  */
> +int
> +__get_nprocs_conf (void)
> +{
> +  /* Try /sys/devices/system/cpu/ first.  */
> +  int result = get_nprocs_cpu ();
> +  if (result)

Ditto.

> +    return result;
> +
> +  /* Fall back to /proc/stat and sched_getaffinity.  */
> +  return get_nprocs_fallback ();
>  }
>  libc_hidden_def (__get_nprocs_conf)
>  weak_alias (__get_nprocs_conf, get_nprocs_conf)
>
Dmitry V. Levin Feb. 8, 2022, 10:40 p.m. UTC | #2
On Tue, Feb 08, 2022 at 04:34:42PM -0300, Adhemerval Zanella wrote:
> On 07/02/2022 10:57, Dmitry V. Levin wrote:
> > get_nprocs() and get_nprocs_conf() use various methods to obtain an
> > accurate number of processors.  Re-introduce __get_nprocs_sched() as
> > a source of information, and fix the order in which these methods are
> > used to return the most accurate information.  The primary source of
> > information used in both functions remains unchanged.
> > 
> > This also changes __get_nprocs_sched() error return value from 2 to 0,
> > but all its users are already prepared to handle that.
> > 
> > Old behavior:
> >   get_nprocs:
> >     /sys/devices/system/cpu/online -> /proc/stat -> 2
> >   get_nprocs_conf:
> >     /sys/devices/system/cpu/ -> /proc/stat -> 2
> > 
> > New behavior:
> >   get_nprocs:
> >     /sys/devices/system/cpu/online -> /proc/stat -> sched_getaffinity -> 2
> >   get_nprocs_conf:
> >     /sys/devices/system/cpu/ -> /proc/stat -> sched_getaffinity -> 2
> > 
> > Fixes: 342298278e ("linux: Revert the use of sched_getaffinity on get_nproc")
> > Closes: BZ #28865
> 
> I am still not fully sure if sched_getaffinity is a correct fallback to an 
> API that should return a system overview, the issue that lead 
> sched_getaffinity removal was that it is subject to per-process filtering
> (either by seccomp, cgroup, etc.) and it might trigger some wrong behavior
> in some programs  (such as monitoring tools and jvms).
> 
> However if the environment does not provide a way to actually obtain such
> information I guess sched_getaffinity should not make things worse (it does
> not make sense to assume multiprocessor if the process is not allowed more
> than one CPU, and monitoring tools should not work if sysfs/procfs are
> not present).
> 
> LGTM with some nits below.  I think you might use brackets instead of
> square bracket in title (to trigger the bugzilla scripts).

I don't think it's the kind of brackets that trigger bugzilla scripts.

> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

Corrected nits and pushed, thanks.
Adhemerval Zanella Feb. 8, 2022, 10:58 p.m. UTC | #3
On 08/02/2022 19:40, Dmitry V. Levin wrote:
> On Tue, Feb 08, 2022 at 04:34:42PM -0300, Adhemerval Zanella wrote:
>> On 07/02/2022 10:57, Dmitry V. Levin wrote:
>>> get_nprocs() and get_nprocs_conf() use various methods to obtain an
>>> accurate number of processors.  Re-introduce __get_nprocs_sched() as
>>> a source of information, and fix the order in which these methods are
>>> used to return the most accurate information.  The primary source of
>>> information used in both functions remains unchanged.
>>>
>>> This also changes __get_nprocs_sched() error return value from 2 to 0,
>>> but all its users are already prepared to handle that.
>>>
>>> Old behavior:
>>>   get_nprocs:
>>>     /sys/devices/system/cpu/online -> /proc/stat -> 2
>>>   get_nprocs_conf:
>>>     /sys/devices/system/cpu/ -> /proc/stat -> 2
>>>
>>> New behavior:
>>>   get_nprocs:
>>>     /sys/devices/system/cpu/online -> /proc/stat -> sched_getaffinity -> 2
>>>   get_nprocs_conf:
>>>     /sys/devices/system/cpu/ -> /proc/stat -> sched_getaffinity -> 2
>>>
>>> Fixes: 342298278e ("linux: Revert the use of sched_getaffinity on get_nproc")
>>> Closes: BZ #28865
>>
>> I am still not fully sure if sched_getaffinity is a correct fallback to an 
>> API that should return a system overview, the issue that lead 
>> sched_getaffinity removal was that it is subject to per-process filtering
>> (either by seccomp, cgroup, etc.) and it might trigger some wrong behavior
>> in some programs  (such as monitoring tools and jvms).
>>
>> However if the environment does not provide a way to actually obtain such
>> information I guess sched_getaffinity should not make things worse (it does
>> not make sense to assume multiprocessor if the process is not allowed more
>> than one CPU, and monitoring tools should not work if sysfs/procfs are
>> not present).
>>
>> LGTM with some nits below.  I think you might use brackets instead of
>> square bracket in title (to trigger the bugzilla scripts).
> 
> I don't think it's the kind of brackets that trigger bugzilla scripts.
> 
>> Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
> 
> Corrected nits and pushed, thanks.

I forgot to ask you to hold off on backports to see if anything breaks, but it
seems you were faster than me.
diff mbox series

Patch

diff --git a/sysdeps/unix/sysv/linux/getsysstats.c b/sysdeps/unix/sysv/linux/getsysstats.c
index c98c8ce3d4..ef77ed193c 100644
--- a/sysdeps/unix/sysv/linux/getsysstats.c
+++ b/sysdeps/unix/sysv/linux/getsysstats.c
@@ -50,9 +50,8 @@  __get_nprocs_sched (void)
        is an arbitrary values assuming such systems should be rare and there
        is no offline cpus.  */
     return max_num_cpus;
-  /* Some other error.  2 is conservative (not a uniprocessor system, so
-     atomics are needed). */
-  return 2;
+  /* Some other error.  */
+  return 0;
 }
 
 static char *
@@ -108,22 +107,19 @@  next_line (int fd, char *const buffer, char **cp, char **re,
 }
 
 static int
-get_nproc_stat (char *buffer, size_t buffer_size)
+get_nproc_stat (void)
 {
+  enum { buffer_size = 1024 };
+  char buffer[buffer_size];
   char *buffer_end = buffer + buffer_size;
   char *cp = buffer_end;
   char *re = buffer_end;
-
-  /* Default to an SMP system in case we cannot obtain an accurate
-     number.  */
-  int result = 2;
+  int result = 0;
 
   const int flags = O_RDONLY | O_CLOEXEC;
   int fd = __open_nocancel ("/proc/stat", flags);
   if (fd != -1)
     {
-      result = 0;
-
       char *l;
       while ((l = next_line (fd, buffer, &cp, &re, buffer_end)) != NULL)
 	/* The current format of /proc/stat has all the cpu* entries
@@ -139,8 +135,8 @@  get_nproc_stat (char *buffer, size_t buffer_size)
   return result;
 }
 
-int
-__get_nprocs (void)
+static int
+get_nprocs_cpu_online (void)
 {
   enum { buffer_size = 1024 };
   char buffer[buffer_size];
@@ -179,7 +175,8 @@  __get_nprocs (void)
 		  }
 	      }
 
-	    result += m - n + 1;
+	    if (m >= n)
+	      result += m - n + 1;
 
 	    l = endp;
 	    if (l < re && *l == ',')
@@ -188,28 +185,18 @@  __get_nprocs (void)
 	while (l < re && *l != '\n');
 
       __close_nocancel_nostatus (fd);
-
-      if (result > 0)
-	return result;
     }
 
-  return get_nproc_stat (buffer, buffer_size);
+  return result;
 }
-libc_hidden_def (__get_nprocs)
-weak_alias (__get_nprocs, get_nprocs)
-
 
-/* On some architectures it is possible to distinguish between configured
-   and active cpus.  */
-int
-__get_nprocs_conf (void)
+static int
+get_nprocs_cpu (void)
 {
-  /* Try to use the sysfs filesystem.  It has actual information about
-     online processors.  */
+  int count = 0;
   DIR *dir = __opendir ("/sys/devices/system/cpu");
   if (dir != NULL)
     {
-      int count = 0;
       struct dirent64 *d;
 
       while ((d = __readdir64 (dir)) != NULL)
@@ -224,12 +211,57 @@  __get_nprocs_conf (void)
 
       __closedir (dir);
 
-      return count;
     }
+  return count;
+}
 
-  enum { buffer_size = 1024 };
-  char buffer[buffer_size];
-  return get_nproc_stat (buffer, buffer_size);
+static int
+get_nprocs_fallback (void)
+{
+  int result;
+
+  /* Try /proc/stat first.  */
+  result = get_nproc_stat ();
+  if (result)
+    return result;
+
+  /* Try sched_getaffinity.  */
+  result = __get_nprocs_sched ();
+  if (result)
+    return result;
+
+  /* We failed to obtain an accurate number.  Be conservative: return
+     the smallest number meaning that this is not a uniprocessor system,
+     so atomics are needed.  */
+  return 2;
+}
+
+int
+__get_nprocs (void)
+{
+  /* Try /sys/devices/system/cpu/online first.  */
+  int result = get_nprocs_cpu_online ();
+  if (result)
+    return result;
+
+  /* Fall back to /proc/stat and sched_getaffinity.  */
+  return get_nprocs_fallback ();
+}
+libc_hidden_def (__get_nprocs)
+weak_alias (__get_nprocs, get_nprocs)
+
+/* On some architectures it is possible to distinguish between configured
+   and active cpus.  */
+int
+__get_nprocs_conf (void)
+{
+  /* Try /sys/devices/system/cpu/ first.  */
+  int result = get_nprocs_cpu ();
+  if (result)
+    return result;
+
+  /* Fall back to /proc/stat and sched_getaffinity.  */
+  return get_nprocs_fallback ();
 }
 libc_hidden_def (__get_nprocs_conf)
 weak_alias (__get_nprocs_conf, get_nprocs_conf)