diff mbox series

[RFC] pthread_create: Name stacks in /proc/<pid>/maps

Message ID 20230927203815.1843924-1-irogers@google.com
State New
Headers show
Series [RFC] pthread_create: Name stacks in /proc/<pid>/maps | expand

Commit Message

Ian Rogers Sept. 27, 2023, 8:38 p.m. UTC
Linux 4.5 removed thread stack annotations due to the complexity of
computing them:
https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a

This untested (other than building) RFC patch uses
PR_SET_VMA_ANON_NAME to name stacks again as part of thread
creation. PR_SET_VMA_ANON_NAME was added in Linux 5.17:
https://github.com/torvalds/linux/commit/9a10064f5625d5572c3626c1516e0bebc6c9fe9b

The patch is intended to create discussion and possibly serve as an
implementation. When I try to test it current I get unrelated
failures and so guidance would be appreciated.

The naming of stacks can be useful in situations like debugging and
profiling, for example, to differentiate stack from heap memory accesses.
---
 include/sys/prctl.h   |  5 +++++
 nptl/allocatestack.c  | 24 ++++++++++++++++++++++++
 nptl/pthread_create.c |  5 +++++
 3 files changed, 34 insertions(+)

Comments

Carlos O'Donell Oct. 13, 2023, 9:44 p.m. UTC | #1
On 9/27/23 16:38, Ian Rogers wrote:
> Linux 4.5 removed thread stack annotations due to the complexity of
> computing them:
> https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a
> 
> This untested (other than building) RFC patch uses
> PR_SET_VMA_ANON_NAME to name stacks again as part of thread
> creation. PR_SET_VMA_ANON_NAME was added in Linux 5.17:
> https://github.com/torvalds/linux/commit/9a10064f5625d5572c3626c1516e0bebc6c9fe9b
> 
> The patch is intended to create discussion and possibly serve as an
> implementation. When I try to test it current I get unrelated
> failures and so guidance would be appreciated.

This is a *fantastic* idea and I'm excited to see this used more globally in glibc
to provide better observability into the anonymous mappings, particularly for the
allocator's mapped arena heaps (could also be used for the early dl-minimal-malloc.c
allocator, and _dl_early_allocate for TLS).

Please raise your current unrelated failures (post to libc-alpha or libc-help) and
I'd be happy to walk through them.
 
> The naming of stacks can be useful in situations like debugging and
> profiling, for example, to differentiate stack from heap memory accesses.

Francesco Nigro and I were just talking about observability issues in the OpenJDK
JVM, and he was surprised when I pointed him at your patch :-)

At a high level I think the patch is going in the right direction and I support
the increased observability.

I want to nudge this in a couple of directions:

(a) Please make this a real patch and we'll start reviewing it :-)

(b) Add a test case.

- Something that starts a thread with pthread_create and observes the named
  anonymous region is correctly named (in some way).

Example: nptl/tst-setgetname.c (but using #include <support/test-driver.c>)

(b) Put the feature behind a tunable.

- Add a glibc.pthread.stack_names tunable of type INT_32 that defaults
  to 1 for "name stack using a prctl" and 0 for "don't name them."
  This is me being conservative and allowing deployments to turn this
  off if they observe performance consequences. Eventually we may get rid
  of this if nobody needs it (we can deprecate and remove tunables
  at major release boundaries).

Example: commit b630be0922dbaaa50eb174a7740f0d3fb88602da

(c) Name the runtime.

- Add a "glibc:" prefix to the names to identify the responsible runtime.
  This makes it clear these are created by and identified as anonymous
  mappings the *runtime* has created for the user.

Questions:

(d) Should we distinguish between user supplied stacks and system supplied
    stacks e.g. pd->stack_user == true "user: stack {tid}"?

(e) Should we distinguish when a stack goes into the cache? A dying thread
    that uses a system provided stack could mark the stack as "glibc: stack cache"
    since the stack is stored in the local cache.

> ---
>  include/sys/prctl.h   |  5 +++++
>  nptl/allocatestack.c  | 24 ++++++++++++++++++++++++
>  nptl/pthread_create.c |  5 +++++
>  3 files changed, 34 insertions(+)
> 
> diff --git a/include/sys/prctl.h b/include/sys/prctl.h
> index d33f3a290e..8968a632d3 100644
> --- a/include/sys/prctl.h
> +++ b/include/sys/prctl.h
> @@ -3,6 +3,11 @@
>  
>  # ifndef _ISOMAC
>  
> +#  ifndef PR_SET_VMA
> +#   define PR_SET_VMA          0x53564d41
> +#   define PR_SET_VMA_ANON_NAME 0
> +#  endif

OK. Matches kernel.

include/uapi/linux/prctl.h:#define PR_SET_VMA		0x53564d41
include/uapi/linux/prctl.h:# define PR_SET_VMA_ANON_NAME		0

> +
>  extern int __prctl (int __option, ...);
>  libc_hidden_proto (__prctl)
>  
> diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
> index f9d8cdfd08..404a222c24 100644
> --- a/nptl/allocatestack.c
> +++ b/nptl/allocatestack.c
> @@ -23,6 +23,7 @@
>  #include <unistd.h>
>  #include <sys/mman.h>
>  #include <sys/param.h>
> +#include <sys/prctl.h>

OK.

>  #include <dl-sysdep.h>
>  #include <dl-tls.h>
>  #include <tls.h>
> @@ -33,6 +34,7 @@
>  #include <nptl-stack.h>
>  #include <libc-lock.h>
>  #include <tls-internal.h>
> +#include "intprops.h"

Should be '#include <intprops.h>'

I expect it works because of the -I's, but if you copied this from
somewhere else, then we should fix it everywhere.

grep -r '#include.*"intprops.h"'
support/timespec-sub.c:#include "intprops.h"
support/timespec-add.c:#include "intprops.h"

Something to fix for another day.

>  
>  /* Default alignment of stack.  */
>  #ifndef STACK_ALIGN
> @@ -577,3 +579,25 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
>  
>    return 0;
>  }
> +
> +/* Use PR_SET_VMA_ANON_NAME to name the thread's stack stack:<tid> */
> +static void
> +name_stack_maps (struct pthread *pd, bool set)
> +{
> +  void *stack = THREAD_GETMEM (pd, stackblock);
> +  size_t stacksize = THREAD_GETMEM (pd, stackblock_size);
> +
> +  if (!set)
> +    __prctl (PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
> +  else
> +    {
> +#define STACK_MAPS_PREFIX "stack:"

As suggested "glibc: stack "

> +      char stack_name[sizeof (STACK_MAPS_PREFIX) +
> +                      INT_BUFSIZE_BOUND (unsigned int)];

OK. Good use of INT_BUFSIZE_BOUND as in assert.

> +
> +      __snprintf (stack_name, sizeof (stack_name), STACK_MAPS_PREFIX "%u",
> +                  (unsigned int) THREAD_GETMEM (pd, tid));

OK. Yes, we have tid cached in pd.

> +      __prctl (PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, stack_name);

OK. We don't care if this fails.

> +#undef STACK_MAPS_PREFIX
> +    }
> +}
> diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
> index 6a41d50109..7249513a80 100644
> --- a/nptl/pthread_create.c
> +++ b/nptl/pthread_create.c
> @@ -369,6 +369,9 @@ start_thread (void *arg)
>    /* Initialize pointers to locale data.  */
>    __ctype_init ();
>  
> +  /* Set up /proc/<pid>/maps entry for stack to stack:tid. */
> +  name_stack_maps (pd, true);

OK.

> +
>    /* Register rseq TLS to the kernel.  */
>    {
>      bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
> @@ -571,6 +574,8 @@ start_thread (void *arg)
>      /* Free the TCB.  */
>      __nptl_free_tcb (pd);
>  
> +  /* Release name for /proc/<pid>/maps. */
> +  name_stack_maps (pd, false);

OK.

>  out:
>    /* We cannot call '_exit' here.  '_exit' will terminate the process.
>
Siddhesh Poyarekar Oct. 16, 2023, 2:16 p.m. UTC | #2
On 2023-10-13 17:44, Carlos O'Donell wrote:
> On 9/27/23 16:38, Ian Rogers wrote:
>> Linux 4.5 removed thread stack annotations due to the complexity of
>> computing them:
>> https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a
>>
>> This untested (other than building) RFC patch uses
>> PR_SET_VMA_ANON_NAME to name stacks again as part of thread
>> creation. PR_SET_VMA_ANON_NAME was added in Linux 5.17:
>> https://github.com/torvalds/linux/commit/9a10064f5625d5572c3626c1516e0bebc6c9fe9b
>>
>> The patch is intended to create discussion and possibly serve as an
>> implementation. When I try to test it current I get unrelated
>> failures and so guidance would be appreciated.
> 
> This is a *fantastic* idea and I'm excited to see this used more globally in glibc
> to provide better observability into the anonymous mappings, particularly for the
> allocator's mapped arena heaps (could also be used for the early dl-minimal-malloc.c
> allocator, and _dl_early_allocate for TLS).
> 
> Please raise your current unrelated failures (post to libc-alpha or libc-help) and
> I'd be happy to walk through them.
>   
>> The naming of stacks can be useful in situations like debugging and
>> profiling, for example, to differentiate stack from heap memory accesses.
> 
> Francesco Nigro and I were just talking about observability issues in the OpenJDK
> JVM, and he was surprised when I pointed him at your patch :-)
> 
> At a high level I think the patch is going in the right direction and I support
> the increased observability.
> 
> I want to nudge this in a couple of directions:
> 
> (a) Please make this a real patch and we'll start reviewing it :-)

One of the things you'd need to do is, e.g. see other examples of how 
linux kernel version checks are done to determine if the prctl is 
available.  Also the Linux-specific code would need to go into 
sysdeps/unix/sysv/linux with NOP overrides in sysdeps/generic so that 
there's no Linux-specific code in the default implementation.

> 
> (b) Add a test case.
> 
> - Something that starts a thread with pthread_create and observes the named
>    anonymous region is correctly named (in some way).
> 
> Example: nptl/tst-setgetname.c (but using #include <support/test-driver.c>)
> 
> (b) Put the feature behind a tunable.
> 
> - Add a glibc.pthread.stack_names tunable of type INT_32 that defaults
>    to 1 for "name stack using a prctl" and 0 for "don't name them."
>    This is me being conservative and allowing deployments to turn this
>    off if they observe performance consequences. Eventually we may get rid
>    of this if nobody needs it (we can deprecate and remove tunables
>    at major release boundaries).
> 
> Example: commit b630be0922dbaaa50eb174a7740f0d3fb88602da

The performance issue in the earlier version of stack annotations that I 
had implemented was that it had quadratic behaviour; the code walked 
through maps of all threads to identify thread stacks.  This on the 
other hand is pretty straightforward and does not involve any iteration, 
so maybe there's no need for this?

> (c) Name the runtime.
> 
> - Add a "glibc:" prefix to the names to identify the responsible runtime.
>    This makes it clear these are created by and identified as anonymous
>    mappings the *runtime* has created for the user.
> 
> Questions:
> 
> (d) Should we distinguish between user supplied stacks and system supplied
>      stacks e.g. pd->stack_user == true "user: stack {tid}"?

I'm not too sure about this because it opens the door for feature creep 
and there's only 79 chars to work with.  It's not a hard objection though.

> (e) Should we distinguish when a stack goes into the cache? A dying thread
>      that uses a system provided stack could mark the stack as "glibc: stack cache"
>      since the stack is stored in the local cache.

It shouldn't be called stack anymore.

FWIW though, this is going to be static information and may get 
invalidated with, e.g. ucontext stack switching or any other custom code 
that switches stacks.  The necessity of VMA walking in my iteration of 
this feature stemmed from that requirement.  It's still a useful enough 
feature IMO, but that fact should probably get recorded in the git commit.

Thanks,
Sid
Adhemerval Zanella Oct. 17, 2023, 3:35 p.m. UTC | #3
On 16/10/23 11:16, Siddhesh Poyarekar wrote:
> On 2023-10-13 17:44, Carlos O'Donell wrote:
>> On 9/27/23 16:38, Ian Rogers wrote:
>>> Linux 4.5 removed thread stack annotations due to the complexity of
>>> computing them:
>>> https://github.com/torvalds/linux/commit/65376df582174ffcec9e6471bf5b0dd79ba05e4a
>>>
>>> This untested (other than building) RFC patch uses
>>> PR_SET_VMA_ANON_NAME to name stacks again as part of thread
>>> creation. PR_SET_VMA_ANON_NAME was added in Linux 5.17:
>>> https://github.com/torvalds/linux/commit/9a10064f5625d5572c3626c1516e0bebc6c9fe9b
>>>
>>> The patch is intended to create discussion and possibly serve as an
>>> implementation. When I try to test it current I get unrelated
>>> failures and so guidance would be appreciated.
>>
>> This is a *fantastic* idea and I'm excited to see this used more globally in glibc
>> to provide better observability into the anonymous mappings, particularly for the
>> allocator's mapped arena heaps (could also be used for the early dl-minimal-malloc.c
>> allocator, and _dl_early_allocate for TLS).
>>
>> Please raise your current unrelated failures (post to libc-alpha or libc-help) and
>> I'd be happy to walk through them.
>>  
>>> The naming of stacks can be useful in situations like debugging and
>>> profiling, for example, to differentiate stack from heap memory accesses.
>>
>> Francesco Nigro and I were just talking about observability issues in the OpenJDK
>> JVM, and he was surprised when I pointed him at your patch :-)
>>
>> At a high level I think the patch is going in the right direction and I support
>> the increased observability.
>>
>> I want to nudge this in a couple of directions:
>>
>> (a) Please make this a real patch and we'll start reviewing it :-)
> 
> One of the things you'd need to do is, e.g. see other examples of how linux kernel version checks are done to determine if the prctl is available.  Also the Linux-specific code would need to go into sysdeps/unix/sysv/linux with NOP overrides in sysdeps/generic so that there's no Linux-specific code in the default implementation.

In this case, being a complete opt-in feature the code can just ignore if the
kernel does support the prctl.  Also, PR_SET_VMA_ANON_NAME returns EINVAL for 
invalid name (without a null terminator), for sizes not being pagesize aligned, 
for invalid size, and/or for overflow; so we need to check for invalid input
before check for flag support:

  bool
  __set_vma_name (void *start, size_t len_in, const char *name)
  {
    size_t page_mask = ~(GLRO(dl_pagesize) - 1);

    uintptr_t vma = (uintptr_t) start;
    size_t len = (len_in + ~page_mask) & page_mask;
    size_t end = start + len;

    /* Check the arguments, so we can distinguish between invalid input
       and unsupported option.  */
    if ((vma & GLRO(dl_pagesize))
        || (len_in && !len)
        || (start + len))
      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
   
    static int prcrl_supported = 1;
    if (atomic_load_relaxed (&getdents64_supported) == 1)
      {
        int r = INLINE_SYSCALL_CALL (PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                                     start, len_in, name);
        /* Invalid 
        if (ret != 0 || errno != EINVAL)
	  return r;

        atomic_store_relaxed (&prcrl_supported, 0);
      }

    return -1;
  }

Also, nptl is already Linux specific so there is no need to add extra abstractions.

> 
>>
>> (b) Add a test case.
>>
>> - Something that starts a thread with pthread_create and observes the named
>>    anonymous region is correctly named (in some way).
>>
>> Example: nptl/tst-setgetname.c (but using #include <support/test-driver.c>)
>>
>> (b) Put the feature behind a tunable.
>>
>> - Add a glibc.pthread.stack_names tunable of type INT_32 that defaults
>>    to 1 for "name stack using a prctl" and 0 for "don't name them."
>>    This is me being conservative and allowing deployments to turn this
>>    off if they observe performance consequences. Eventually we may get rid
>>    of this if nobody needs it (we can deprecate and remove tunables
>>    at major release boundaries).
>>
>> Example: commit b630be0922dbaaa50eb174a7740f0d3fb88602da
> 
> The performance issue in the earlier version of stack annotations that I had implemented was that it had quadratic behaviour; the code walked through maps of all threads to identify thread stacks.  This on the other hand is pretty straightforward and does not involve any iteration, so maybe there's no need for this?

The issue is the prctl requires the mmap lock, and it might add extra
contention depending on thread creating creation rate and lifetime
(the lock contention might eventually be fixed [1], but it is on
discussion afaik). 

That's why I suggested adding the tunable, just to have a way to disable
if it becomes an issue.

> 
>> (c) Name the runtime.
>>
>> - Add a "glibc:" prefix to the names to identify the responsible runtime.
>>    This makes it clear these are created by and identified as anonymous
>>    mappings the *runtime* has created for the user.
>>
>> Questions:
>>
>> (d) Should we distinguish between user supplied stacks and system supplied
>>      stacks e.g. pd->stack_user == true "user: stack {tid}"?
> 
> I'm not too sure about this because it opens the door for feature creep and there's only 79 chars to work with.  It's not a hard objection though.

The name is defined and set by libc.so, so I think we add different names for
user created stacks.

> 
>> (e) Should we distinguish when a stack goes into the cache? A dying thread
>>      that uses a system provided stack could mark the stack as "glibc: stack cache"
>>      since the stack is stored in the local cache.
> 
> It shouldn't be called stack anymore.
> 
> FWIW though, this is going to be static information and may get invalidated with, e.g. ucontext stack switching or any other custom code that switches stacks.  The necessity of VMA walking in my iteration of this feature stemmed from that requirement.  It's still a useful enough feature IMO, but that fact should probably get recorded in the git commit.

I think this is out of scope to handle ucontext stack switching.

[1] https://lwn.net/Articles/906852/
diff mbox series

Patch

diff --git a/include/sys/prctl.h b/include/sys/prctl.h
index d33f3a290e..8968a632d3 100644
--- a/include/sys/prctl.h
+++ b/include/sys/prctl.h
@@ -3,6 +3,11 @@ 
 
 # ifndef _ISOMAC
 
+#  ifndef PR_SET_VMA
+#   define PR_SET_VMA          0x53564d41
+#   define PR_SET_VMA_ANON_NAME 0
+#  endif
+
 extern int __prctl (int __option, ...);
 libc_hidden_proto (__prctl)
 
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index f9d8cdfd08..404a222c24 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -23,6 +23,7 @@ 
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/param.h>
+#include <sys/prctl.h>
 #include <dl-sysdep.h>
 #include <dl-tls.h>
 #include <tls.h>
@@ -33,6 +34,7 @@ 
 #include <nptl-stack.h>
 #include <libc-lock.h>
 #include <tls-internal.h>
+#include "intprops.h"
 
 /* Default alignment of stack.  */
 #ifndef STACK_ALIGN
@@ -577,3 +579,25 @@  allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
   return 0;
 }
+
+/* Use PR_SET_VMA_ANON_NAME to name the thread's stack stack:<tid> */
+static void
+name_stack_maps (struct pthread *pd, bool set)
+{
+  void *stack = THREAD_GETMEM (pd, stackblock);
+  size_t stacksize = THREAD_GETMEM (pd, stackblock_size);
+
+  if (!set)
+    __prctl (PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, NULL);
+  else
+    {
+#define STACK_MAPS_PREFIX "stack:"
+      char stack_name[sizeof (STACK_MAPS_PREFIX) +
+                      INT_BUFSIZE_BOUND (unsigned int)];
+
+      __snprintf (stack_name, sizeof (stack_name), STACK_MAPS_PREFIX "%u",
+                  (unsigned int) THREAD_GETMEM (pd, tid));
+      __prctl (PR_SET_VMA, PR_SET_VMA_ANON_NAME, stack, stacksize, stack_name);
+#undef STACK_MAPS_PREFIX
+    }
+}
diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 6a41d50109..7249513a80 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -369,6 +369,9 @@  start_thread (void *arg)
   /* Initialize pointers to locale data.  */
   __ctype_init ();
 
+  /* Set up /proc/<pid>/maps entry for stack to stack:tid. */
+  name_stack_maps (pd, true);
+
   /* Register rseq TLS to the kernel.  */
   {
     bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
@@ -571,6 +574,8 @@  start_thread (void *arg)
     /* Free the TCB.  */
     __nptl_free_tcb (pd);
 
+  /* Release name for /proc/<pid>/maps. */
+  name_stack_maps (pd, false);
 out:
   /* We cannot call '_exit' here.  '_exit' will terminate the process.