diff mbox series

[RFC,v11,15/17] nptl: Move the rseq area to the 'extra TLS' block

Message ID 20240801-rseq-abi-v11-split-v11-15-b7bbc0138bfd@efficios.com
State New
Headers show
Series Add rseq extensible ABI support | expand

Commit Message

Michael Jeanson Aug. 2, 2024, 7:02 p.m. UTC
Move the rseq area to the newly added 'extra TLS' block. This is the
last step in adding support for the rseq extended ABI. The size of the
rseq area is now dynamic and depends on the rseq features reported by
the kernel through the ELF auxiliary vector. This will allow
applications to use rseq features past the 32 bytes of the original rseq
ABI as they become available in future kernels.

Signed-off-by: Michael Jeanson <mjeanson@efficios.com>
---
 nptl/pthread_create.c                             |  2 +-
 sysdeps/nptl/dl-tls_init_tp.c                     | 20 +++---
 sysdeps/unix/sysv/linux/Makefile                  | 10 +++
 sysdeps/unix/sysv/linux/rseq-internal.h           | 80 ++++++++++++++++------
 sysdeps/unix/sysv/linux/sched_getcpu.c            |  3 +-
 sysdeps/unix/sysv/linux/tst-rseq-disable-static.c |  1 +
 sysdeps/unix/sysv/linux/tst-rseq-disable.c        | 60 ++++++++++++++---
 sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c    |  1 +
 sysdeps/unix/sysv/linux/tst-rseq-static.c         |  1 +
 sysdeps/unix/sysv/linux/tst-rseq.c                | 81 ++++++++++++++++++-----
 sysdeps/unix/sysv/linux/tst-rseq.h                |  7 +-
 11 files changed, 209 insertions(+), 57 deletions(-)
diff mbox series

Patch

diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c
index 1d3665d5ed..9b49ee7121 100644
--- a/nptl/pthread_create.c
+++ b/nptl/pthread_create.c
@@ -691,7 +691,7 @@  __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
 
   /* Inherit rseq registration state.  Without seccomp filters, rseq
      registration will either always fail or always succeed.  */
-  if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0)
+  if ((int) RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id) >= 0)
     pd->flags |= ATTR_FLAG_DO_RSEQ;
 
   /* Initialize the field for the ID of the thread which is waiting
diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
index 7803e19fd1..2869a0647d 100644
--- a/sysdeps/nptl/dl-tls_init_tp.c
+++ b/sysdeps/nptl/dl-tls_init_tp.c
@@ -99,19 +99,19 @@  __tls_init_tp (void)
   }
 
   {
+    /* If the registration fails or is disabled by tunable, the public rseq
+       size will be '0' regardless of the feature size of the allocated rseq
+       area.  An rseq area of at least 32 bytes is always allocated since
+       application code is allowed to test the status of the rseq registration
+       with 'rseq->cpu_id >= 0'.  */
     bool do_rseq = true;
     do_rseq = TUNABLE_GET (rseq, int, NULL);
     if (rseq_register_current_thread (pd, do_rseq))
-      _rseq_size = RSEQ_AREA_SIZE_INITIAL_USED;
-
-#ifdef RSEQ_SIG
-    /* This should be a compile-time constant, but the current
-       infrastructure makes it difficult to determine its value.  Not
-       all targets support __thread_pointer, so set __rseq_offset only
-       if the rseq registration may have happened because RSEQ_SIG is
-       defined.  */
-    _rseq_offset = (char *) &pd->rseq_area - (char *) __thread_pointer ();
-#endif
+      {
+	/* On successful registration, expose the feature size in the public
+	   '__rseq_size' symbol.  */
+        _rseq_size = GLRO (dl_rseq_feature_size);
+      }
   }
 
   /* Set initial thread's stack block from 0 up to __libc_stack_end.
diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index 59998c7af4..a6c142d363 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -263,6 +263,11 @@  tests-internal += \
   tst-rseq-disable \
   # tests-internal
 
+tests-static += \
+  tst-rseq-disable-static \
+  tst-rseq-static \
+  # tests-static
+
 tests-time64 += \
   tst-adjtimex-time64 \
   tst-clock_adjtime-time64 \
@@ -396,6 +401,7 @@  $(objpfx)tst-mount-compile.out: ../sysdeps/unix/sysv/linux/tst-mount-compile.py
 $(objpfx)tst-mount-compile.out: $(sysdeps-linux-python-deps)
 
 tst-rseq-disable-ENV = GLIBC_TUNABLES=glibc.pthread.rseq=0
+tst-rseq-disable-static-ENV = GLIBC_TUNABLES=glibc.pthread.rseq=0
 
 endif # $(subdir) == misc
 
@@ -661,4 +667,8 @@  tests += \
 tests-internal += \
   tst-rseq-nptl \
   # tests-internal
+
+tests-static += \
+  tst-rseq-nptl-static \
+  # tests-static
 endif
diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
index 2c68b8c70c..ec28d26de0 100644
--- a/sysdeps/unix/sysv/linux/rseq-internal.h
+++ b/sysdeps/unix/sysv/linux/rseq-internal.h
@@ -24,11 +24,26 @@ 
 #include <stdbool.h>
 #include <stdio.h>
 #include <sys/rseq.h>
+#include <thread_pointer.h>
+#include <ldsodefs.h>
 
-/* 32 is the initially required value for the area size.  The
-   actually used rseq size may be less (20 bytes initially).  */
-#define RSEQ_AREA_SIZE_INITIAL 32
-#define RSEQ_AREA_SIZE_INITIAL_USED 20
+/* rseq area registered with the kernel.  Use a custom definition here to
+   isolate from the system provided header which could lack some fields of the
+   Extended ABI.
+
+   Access to fields of the Extended ABI beyond the 20 bytes of the original ABI
+   (after 'flags') must be gated by a check of the feature size.  */
+struct rseq_area
+{
+  /* Original ABI.  */
+  uint32_t cpu_id_start;
+  uint32_t cpu_id;
+  uint64_t rseq_cs;
+  uint32_t flags;
+  /* Extended ABI.  */
+  uint32_t node_id;
+  uint32_t mm_cid;
+};
 
 /* Minimum size of the rseq area.  */
 #define RSEQ_AREA_MIN_SIZE 32
@@ -39,10 +54,28 @@ 
 /* Minimum alignment of the rseq area.  */
 #define RSEQ_MIN_ALIGN 32
 
-/* The variables are in .data.relro but are not yet write-protected.  */
+/* Size of the active features in the rseq area of the current registration, 0
+   if registration failed.
+   In .data.relro but not yet write-protected.  */
 extern unsigned int _rseq_size attribute_hidden;
+
+/* Offset from the thread pointer to the rseq area, always set to allow
+   checking the registration status by reading the 'cpu_id' field.
+   In .data.relro but not yet write-protected.  */
 extern ptrdiff_t _rseq_offset attribute_hidden;
 
+/* Returns a pointer to the current thread rseq area.  */
+static inline struct rseq_area *
+rseq_get_area(void)
+{
+#if IS_IN (rtld)
+  /* Use the hidden symbol in ld.so.  */
+  return (struct rseq_area *) ((char *) __thread_pointer() + _rseq_offset);
+#else
+  return (struct rseq_area *) ((char *) __thread_pointer() + __rseq_offset);
+#endif
+}
+
 #ifdef RSEQ_SIG
 static inline bool
 rseq_register_current_thread (struct pthread *self, bool do_rseq)
@@ -50,29 +83,38 @@  rseq_register_current_thread (struct pthread *self, bool do_rseq)
   if (do_rseq)
     {
       unsigned int size;
-#if IS_IN (rtld)
-      /* Use the hidden symbol in ld.so.  */
-      size = _rseq_size;
-#else
-      size = __rseq_size;
-#endif
-      if (size < RSEQ_AREA_SIZE_INITIAL)
-        /* The initial implementation used only 20 bytes out of 32,
-           but still expected size 32.  */
-        size = RSEQ_AREA_SIZE_INITIAL;
-      int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area,
-                                       size, 0, RSEQ_SIG);
+
+      /* Get the feature size from the auxiliary vector; this will always be at
+         least 20 bytes.  */
+      size = GLRO (dl_rseq_feature_size);
+
+      /* The feature size can be smaller than the minimum rseq area size of 32
+         bytes that the syscall will accept.  If this is the case, bump the
+         size to the minimum of 32 bytes.  */
+      if (size < RSEQ_AREA_MIN_SIZE)
+        size = RSEQ_AREA_MIN_SIZE;
+
+      /* The kernel expects 'rseq_area->rseq_cs == NULL' on registration, zero
+         the whole rseq area.  */
+      memset(rseq_get_area(), 0, size);
+
+      int ret = INTERNAL_SYSCALL_CALL (rseq, rseq_get_area(), size, 0,
+		      RSEQ_SIG);
       if (!INTERNAL_SYSCALL_ERROR_P (ret))
         return true;
     }
-  THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
+
+  /* If the registration failed or is disabled by tunable, we have to set 'cpu_id' to
+     RSEQ_CPU_ID_REGISTRATION_FAILED to allow userspace to properly test the
+     status of the registration.  */
+  RSEQ_SETMEM (rseq_get_area(), cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
   return false;
 }
 #else /* RSEQ_SIG */
 static inline bool
 rseq_register_current_thread (struct pthread *self, bool do_rseq)
 {
-  THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
+  RSEQ_SETMEM (rseq_get_area(), cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
   return false;
 }
 #endif /* RSEQ_SIG */
diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c b/sysdeps/unix/sysv/linux/sched_getcpu.c
index 72a3360550..3cdf854316 100644
--- a/sysdeps/unix/sysv/linux/sched_getcpu.c
+++ b/sysdeps/unix/sysv/linux/sched_getcpu.c
@@ -19,6 +19,7 @@ 
 #include <sched.h>
 #include <sysdep.h>
 #include <sysdep-vdso.h>
+#include <rseq-internal.h>
 
 static int
 vsyscall_sched_getcpu (void)
@@ -36,6 +37,6 @@  vsyscall_sched_getcpu (void)
 int
 sched_getcpu (void)
 {
-  int cpu_id = THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id);
+  int cpu_id = RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id);
   return __glibc_likely (cpu_id >= 0) ? cpu_id : vsyscall_sched_getcpu ();
 }
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c b/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c
new file mode 100644
index 0000000000..2687d13d3d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-rseq-disable-static.c
@@ -0,0 +1 @@ 
+#include "tst-rseq-disable.c"
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-disable.c b/sysdeps/unix/sysv/linux/tst-rseq-disable.c
index bbc655bec4..b1f4e894f1 100644
--- a/sysdeps/unix/sysv/linux/tst-rseq-disable.c
+++ b/sysdeps/unix/sysv/linux/tst-rseq-disable.c
@@ -26,32 +26,65 @@ 
 #include <unistd.h>
 
 #ifdef RSEQ_SIG
+# include <sys/auxv.h>
+# include "tst-rseq.h"
+
+static __thread struct rseq local_rseq = {
+  .cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED,
+};
 
 /* Check that rseq can be registered and has not been taken by glibc.  */
 static void
 check_rseq_disabled (void)
 {
-  struct pthread *pd = THREAD_SELF;
+  struct rseq *rseq_area = (struct rseq *) ((char *) __thread_pointer () + __rseq_offset);
+
+#if TLS_TCB_AT_TP
+  /* The rseq area is at least 32 bytes and lies below the thread pointer.  */
+  TEST_VERIFY (__rseq_offset <= -RSEQ_TEST_MIN_SIZE);
+#elif TLS_DTV_AT_TP
+  /* The rseq area block should come after the thread pointer.  */
+  TEST_VERIFY (__rseq_offset >= 0);
+#else
+# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
+#endif
 
+  /* __rseq_flags is unused and should always be '0'.  */
   TEST_COMPARE (__rseq_flags, 0);
-  TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset
-               == (char *) &pd->rseq_area);
+
+  /* When rseq is not registered, __rseq_size should always be '0'.  */
   TEST_COMPARE (__rseq_size, 0);
-  TEST_COMPARE ((int) pd->rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
 
-  int ret = syscall (__NR_rseq, &pd->rseq_area, sizeof (pd->rseq_area),
-                     0, RSEQ_SIG);
+  /* When rseq is not registered, the 'cpu_id' field should be set to
+     RSEQ_CPU_ID_REGISTRATION_FAILED.  */
+  TEST_COMPARE ((int) rseq_area->cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED);
+
+  /* Test a rseq registration which should succeed since the internal
+     registration is disabled.  */
+  int ret = syscall (__NR_rseq, &local_rseq, RSEQ_TEST_MIN_SIZE, 0, RSEQ_SIG);
   if (ret == 0)
     {
-      ret = syscall (__NR_rseq, &pd->rseq_area, sizeof (pd->rseq_area),
+      /* Cast needed: 'cpu_id' is unsigned, so uncast '>= 0' always holds.  */
+      TEST_VERIFY ((int) local_rseq.cpu_id >= 0);
+
+      /* Test we can also unregister rseq.  */
+      ret = syscall (__NR_rseq, &local_rseq, RSEQ_TEST_MIN_SIZE,
                      RSEQ_FLAG_UNREGISTER, RSEQ_SIG);
       TEST_COMPARE (ret, 0);
-      pd->rseq_area.cpu_id = RSEQ_CPU_ID_REGISTRATION_FAILED;
     }
   else
     {
-      TEST_VERIFY (errno != -EINVAL);
-      TEST_VERIFY (errno != -EBUSY);
+      /* Check if we failed with EINVAL which would mean an invalid rseq flags,
+         a mis-aligned rseq area address or an incorrect rseq size.  */
+      TEST_VERIFY (errno != EINVAL);
+
+      /* Check if we failed with EBUSY which means an existing rseq
+         registration.  */
+      TEST_VERIFY (errno != EBUSY);
+
+      /* Check if we failed with EFAULT which means an invalid rseq area
+         address.  */
+      TEST_VERIFY (errno != EFAULT);
     }
 }
 
@@ -71,6 +104,13 @@  proc_func (void *ignored)
 static int
 do_test (void)
 {
+  printf ("info: __rseq_size: %u\n", __rseq_size);
+  printf ("info: __rseq_offset: %td\n", __rseq_offset);
+  printf ("info: __rseq_flags: %u\n", __rseq_flags);
+  printf ("info: getauxval (AT_RSEQ_FEATURE_SIZE): %ld\n",
+          getauxval (AT_RSEQ_FEATURE_SIZE));
+  printf ("info: getauxval (AT_RSEQ_ALIGN): %ld\n", getauxval (AT_RSEQ_ALIGN));
+
   puts ("info: checking main thread");
   check_rseq_disabled ();
 
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c b/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c
new file mode 100644
index 0000000000..6e2c923bb9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-rseq-nptl-static.c
@@ -0,0 +1 @@ 
+#include "tst-rseq-nptl.c"
diff --git a/sysdeps/unix/sysv/linux/tst-rseq-static.c b/sysdeps/unix/sysv/linux/tst-rseq-static.c
new file mode 100644
index 0000000000..1d97f3bd3d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/tst-rseq-static.c
@@ -0,0 +1 @@ 
+#include "tst-rseq.c"
diff --git a/sysdeps/unix/sysv/linux/tst-rseq.c b/sysdeps/unix/sysv/linux/tst-rseq.c
index 08a9533130..802672e840 100644
--- a/sysdeps/unix/sysv/linux/tst-rseq.c
+++ b/sysdeps/unix/sysv/linux/tst-rseq.c
@@ -19,6 +19,8 @@ 
    not linked against libpthread.  */
 
 #include <support/check.h>
+#include <support/namespace.h>
+#include <support/xthread.h>
 #include <stdio.h>
 #include <sys/rseq.h>
 #include <unistd.h>
@@ -32,23 +34,66 @@ 
 # include <sys/auxv.h>
 # include <thread_pointer.h>
 # include <tls.h>
+# include <sys/auxv.h>
 # include "tst-rseq.h"
 
 static void
 do_rseq_main_test (void)
 {
-  struct pthread *pd = THREAD_SELF;
+  size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), RSEQ_TEST_MIN_ALIGN);
+  size_t rseq_feature_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), RSEQ_TEST_MIN_FEATURE_SIZE);
+  size_t rseq_alloc_size = roundup (MAX (rseq_feature_size, RSEQ_TEST_MIN_SIZE), rseq_align);
+  struct rseq *global_rseq = __thread_pointer () + __rseq_offset;
 
   TEST_VERIFY_EXIT (rseq_thread_registered ());
+
+  /* __rseq_flags is unused and should always be '0'.  */
   TEST_COMPARE (__rseq_flags, 0);
-  TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset
-               == (char *) &pd->rseq_area);
-  /* The current implementation only supports the initial size.  */
-  TEST_COMPARE (__rseq_size, 20);
+
+  /* When rseq is registered, __rseq_size should report the feature size.  */
+  TEST_COMPARE (__rseq_size, rseq_feature_size);
+
+  /* When rseq is registered, the 'cpu_id' field should be set to a valid cpu
+     number.  */
+  TEST_VERIFY ((int32_t) global_rseq->cpu_id >= 0);
+
+  /* The rseq area address must be aligned.  */
+  TEST_VERIFY (((unsigned long) global_rseq % rseq_align) == 0);
+
+#if TLS_TCB_AT_TP
+  /* The rseq area is at least 32 bytes and lies below the thread pointer.  */
+  TEST_VERIFY (__rseq_offset <= -RSEQ_TEST_MIN_SIZE);
+#elif TLS_DTV_AT_TP
+  /* The rseq area block should come after the thread pointer.  */
+  TEST_VERIFY (__rseq_offset >= 0);
+#else
+# error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
+#endif
+
+  /* Test a rseq registration with the same arguments as the internal
+     registration which should fail with errno == EBUSY.  */
+  TEST_VERIFY (((unsigned long) global_rseq % rseq_align) == 0);
+  TEST_VERIFY (__rseq_size <= rseq_alloc_size);
+  int ret = syscall (__NR_rseq, global_rseq, rseq_alloc_size, 0, RSEQ_SIG);
+  TEST_VERIFY (ret != 0);
+  TEST_COMPARE (errno, EBUSY);
+}
+
+static void *
+thread_func (void *ignored)
+{
+  do_rseq_main_test ();
+  return NULL;
 }
 
 static void
-do_rseq_test (void)
+proc_func (void *ignored)
+{
+  do_rseq_main_test ();
+}
+
+static int
+do_test (void)
 {
   if (!rseq_available ())
     {
@@ -60,21 +105,27 @@  do_rseq_test (void)
   printf ("info: getauxval (AT_RSEQ_FEATURE_SIZE): %ld\n",
           getauxval (AT_RSEQ_FEATURE_SIZE));
   printf ("info: getauxval (AT_RSEQ_ALIGN): %ld\n", getauxval (AT_RSEQ_ALIGN));
+
+  puts ("info: checking main thread");
+  do_rseq_main_test ();
+
+  puts ("info: checking main thread (2)");
   do_rseq_main_test ();
+
+  puts ("info: checking new thread");
+  xpthread_join (xpthread_create (NULL, thread_func, NULL));
+
+  puts ("info: checking subprocess");
+  support_isolate_in_subprocess (proc_func, NULL);
+
+  return 0;
 }
 #else /* RSEQ_SIG */
-static void
-do_rseq_test (void)
-{
-  FAIL_UNSUPPORTED ("glibc does not define RSEQ_SIG, skipping test");
-}
-#endif /* RSEQ_SIG */
-
 static int
 do_test (void)
 {
-  do_rseq_test ();
-  return 0;
+  FAIL_UNSUPPORTED ("glibc does not define RSEQ_SIG, skipping test");
 }
+#endif /* RSEQ_SIG */
 
 #include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/tst-rseq.h b/sysdeps/unix/sysv/linux/tst-rseq.h
index dc603327d3..7a2e19b07f 100644
--- a/sysdeps/unix/sysv/linux/tst-rseq.h
+++ b/sysdeps/unix/sysv/linux/tst-rseq.h
@@ -23,11 +23,16 @@ 
 #include <syscall.h>
 #include <sys/rseq.h>
 #include <tls.h>
+#include <rseq-internal.h>
+
+#define RSEQ_TEST_MIN_SIZE 32
+#define RSEQ_TEST_MIN_FEATURE_SIZE 20
+#define RSEQ_TEST_MIN_ALIGN 32
 
 static inline bool
 rseq_thread_registered (void)
 {
-  return THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id) >= 0;
+  return RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id) >= 0;
 }
 
 static inline int