Message ID | 20231123214021.286351-1-mjeanson@efficios.com |
---|---|
State | New |
Headers | show |
Series | [RFC,v2] Add rseq extensible ABI support | expand |
On 23/11/23 18:40, Michael Jeanson wrote: > Introduced in Linux v6.3 the rseq extensible ABI [1] will allow adding > rseq features past the initial 32 bytes of the original ABI. > > While the rseq features in the latest kernel still fit within the > original ABI size, there are currently only 4 bytes left. It would thus > be a good time to add support for the extensible ABI so that when new > features are added, they are immediately available to GNU libc users. > > This is a first draft of a possible implementation to gather some > feedback, it currently only supports dynamically linked executables. > > We use the ELF auxiliary vectors to query the kernel for the size and > alignment of the rseq area, if this fails we default to the original > fixed size and alignment of '32' which the kernel will accept as a > compatibility mode with the original ABI. > > This makes the size of the rseq area variable and thus requires to > relocate it out of 'struct pthread'. We chose to move it after (in block > allocation order) the last TLS block since it required a fairly small > modification to the TLS block allocator and did not interfere with the > main executable TLS block which must always be first. > > [1] https://lore.kernel.org/all/20221122203932.231377-4-mathieu.desnoyers@efficios.com/ > > Signed-off-by: Michael Jeanson <mjeanson@efficios.com> > Co-Authored-By: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > Signed-off-By: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> > Cc: Florian Weimer <fweimer@redhat.com> > Cc: Carlos O'Donell <carlos@redhat.com> The arm32 regressions does seems related to this patch [1], the aarch64 ones are not clear [2]. [1] https://ci.linaro.org/job/tcwg_glibc_check--master-arm-precommit/1080/artifact/artifacts/artifacts.precommit/notify/mail-body.txt [2] https://ci.linaro.org/job/tcwg_glibc_check--master-aarch64-precommit/1056/artifact/artifacts/artifacts.precommit/notify/mail-body.txt > --- > Changes since RFC v1: > - Insert the rseq area after the last TLS block > - Add proper support for TLS_TCB_AT_TP variant > --- > csu/libc-tls.c | 6 +++ > elf/dl-tls.c | 56 +++++++++++++++++++++++++ > elf/rtld_static_init.c | 6 +++ > nptl/descr.h | 20 ++------- > nptl/pthread_create.c | 2 +- > sysdeps/generic/ldsodefs.h | 6 +++ > sysdeps/i386/nptl/tcb-access.h | 54 ++++++++++++++++++++++++ > sysdeps/nptl/dl-tls_init_tp.c | 4 +- > sysdeps/nptl/tcb-access.h | 5 +++ > sysdeps/unix/sysv/linux/rseq-internal.h | 26 ++++++++++-- > sysdeps/unix/sysv/linux/sched_getcpu.c | 3 +- > sysdeps/unix/sysv/linux/tst-rseq.c | 10 +++-- > sysdeps/unix/sysv/linux/tst-rseq.h | 3 +- > sysdeps/x86_64/nptl/tcb-access.h | 55 +++++++++++++++++++++++- > 14 files changed, 225 insertions(+), 31 deletions(-) > > diff --git a/csu/libc-tls.c b/csu/libc-tls.c > index cdf6442c02..8458f47577 100644 > --- a/csu/libc-tls.c > +++ b/csu/libc-tls.c > @@ -62,6 +62,12 @@ size_t _dl_tls_static_surplus; > dynamic TLS access (e.g. with TLSDESC). */ > size_t _dl_tls_static_optional; > > +/* Offset of the rseq area from the thread pointer. */ > +ptrdiff_t _dl_tls_rseq_offset; > + > +/* Size of the rseq area in the static TLS block. */ > +size_t _dl_tls_rseq_size; > + > /* Generation counter for the dtv. */ > size_t _dl_tls_generation; > > diff --git a/elf/dl-tls.c b/elf/dl-tls.c > index c192b5a13a..e9390f87fd 100644 > --- a/elf/dl-tls.c > +++ b/elf/dl-tls.c > @@ -24,6 +24,7 @@ > #include <unistd.h> > #include <sys/param.h> > #include <atomic.h> > +#include <sys/auxv.h> > > #include <tls.h> > #include <dl-tls.h> > @@ -75,6 +76,12 @@ > /* Default for dl_tls_static_optional. */ > #define OPTIONAL_TLS 512 > > +/* Minimum size of the rseq area. */ > +#define TLS_DL_RSEQ_MIN_SIZE 32 > + > +/* Minimum size of the rseq area alignment. */ > +#define TLS_DL_RSEQ_MIN_ALIGN 32 > + > /* Compute the static TLS surplus based on the namespace count and the > TLS space that can be used for optimizations. */ > static inline int > @@ -297,6 +304,29 @@ _dl_determine_tlsoffset (void) > slotinfo[cnt].map->l_tls_offset = off; > } > > + /* Insert the rseq area block after the last TLS block. */ > + > + /* Get the rseq auxiliary vectors, 0 is returned when not implemented > + and we then default to the rseq ABI minimums. */ > + size_t rseq_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), TLS_DL_RSEQ_MIN_SIZE); > + size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), TLS_DL_RSEQ_MIN_ALIGN); > + > + /* Make sure the rseq area size is a multiple of the requested > + aligment. */ > + rseq_size = roundup (rseq_size, rseq_align); > + > + /* Add the rseq area block to the global offset. */ > + offset = roundup (offset, rseq_align) + rseq_size; > + > + /* Increase the max_align if necessary. */ > + max_align = MAX (max_align, rseq_align); > + > + /* Record the rseq_area block size and offset. The offset is negative > + with TLS_TCB_AT_TP because the TLS blocks are located before the > + thread pointer. */ > + GLRO (dl_tls_rseq_offset) = -offset; > + GLRO (dl_tls_rseq_size) = rseq_size; > + > GL(dl_tls_static_used) = offset; > GLRO (dl_tls_static_size) = (roundup (offset + GLRO(dl_tls_static_surplus), > max_align) > @@ -342,6 +372,32 @@ _dl_determine_tlsoffset (void) > offset = off + slotinfo[cnt].map->l_tls_blocksize - firstbyte; > } > > + /* Insert the rseq area block after the last TLS block. */ > + > + /* Get the rseq auxiliary vectors, 0 is returned when not implemented > + and we then default to the rseq ABI minimums. */ > + size_t rseq_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), TLS_DL_RSEQ_MIN_SIZE); > + size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), TLS_DL_RSEQ_MIN_ALIGN); > + > + /* Make sure the rseq area size is a multiple of the requested > + aligment. */ > + rseq_size = roundup (rseq_size, rseq_align); > + > + /* Align the global offset to the beginning of the rseq area. */ > + offset = roundup (offset, rseq_align); > + > + /* Record the rseq_area block size and offset. The offset is positive > + with TLS_DTV_AT_TP because the TLS blocks are located after the > + thread pointer. */ > + GLRO (dl_tls_rseq_size) = rseq_size; > + GLRO (dl_tls_rseq_offset) = offset; > + > + /* Add the rseq area block to the global offset. */ > + offset += rseq_size; > + > + /* Increase the max_align if necessary. */ > + max_align = MAX (max_align, rseq_align); > + > GL(dl_tls_static_used) = offset; > GLRO (dl_tls_static_size) = roundup (offset + GLRO(dl_tls_static_surplus), > TCB_ALIGNMENT); > diff --git a/elf/rtld_static_init.c b/elf/rtld_static_init.c > index aec8cc056b..84548023ab 100644 > --- a/elf/rtld_static_init.c > +++ b/elf/rtld_static_init.c > @@ -78,6 +78,12 @@ __rtld_static_init (struct link_map *map) > extern __typeof (dl->_dl_tls_static_size) _dl_tls_static_size > attribute_hidden; > dl->_dl_tls_static_size = _dl_tls_static_size; > + extern __typeof (dl->_dl_tls_rseq_size) _dl_tls_rseq_size > + attribute_hidden; > + dl->_dl_tls_rseq_size = _dl_tls_rseq_size; > + extern __typeof (dl->_dl_tls_rseq_offset) _dl_tls_rseq_offset > + attribute_hidden; > + dl->_dl_tls_rseq_offset = _dl_tls_rseq_offset; > dl->_dl_find_object = _dl_find_object; > > __rtld_static_init_arch (map, dl); > diff --git a/nptl/descr.h b/nptl/descr.h > index 0171576c23..fadba1f619 100644 > --- a/nptl/descr.h > +++ b/nptl/descr.h > @@ -404,25 +404,11 @@ struct pthread > /* Used on strsignal. */ > struct tls_internal_t tls_state; > > - /* rseq area registered with the kernel. Use a custom definition > - here to isolate from kernel struct rseq changes. The > - implementation of sched_getcpu needs acccess to the cpu_id field; > - the other fields are unused and not included here. */ > - union > - { > - struct > - { > - uint32_t cpu_id_start; > - uint32_t cpu_id; > - }; > - char pad[32]; /* Original rseq area size. */ > - } rseq_area __attribute__ ((aligned (32))); > - > /* Amount of end padding, if any, in this structure. > - This definition relies on rseq_area being last. */ > + This definition relies on tls_state being last. */ > #define PTHREAD_STRUCT_END_PADDING \ > - (sizeof (struct pthread) - offsetof (struct pthread, rseq_area) \ > - + sizeof ((struct pthread) {}.rseq_area)) > + (sizeof (struct pthread) - offsetof (struct pthread, tls_state) \ > + + sizeof ((struct pthread) {}.tls_state)) > } __attribute ((aligned (TCB_ALIGNMENT))); > > static inline bool > diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c > index 63cb684f04..4ee0f24741 100644 > --- a/nptl/pthread_create.c > +++ b/nptl/pthread_create.c > @@ -691,7 +691,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr, > > /* Inherit rseq registration state. Without seccomp filters, rseq > registration will either always fail or always succeed. */ > - if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0) > + if ((int) RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id) >= 0) > pd->flags |= ATTR_FLAG_DO_RSEQ; > > /* Initialize the field for the ID of the thread which is waiting > diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h > index 9b50ddd09f..f8a3628bc0 100644 > --- a/sysdeps/generic/ldsodefs.h > +++ b/sysdeps/generic/ldsodefs.h > @@ -610,6 +610,12 @@ struct rtld_global_ro > See comments in elf/dl-tls.c where it is initialized. */ > EXTERN size_t _dl_tls_static_surplus; > > + /* Offset of the rseq area from the thread pointer. */ > + EXTERN ptrdiff_t _dl_tls_rseq_offset; > + > + /* Size of the rseq area in the static TLS block. */ > + EXTERN size_t _dl_tls_rseq_size; > + > /* Name of the shared object to be profiled (if any). */ > EXTERN const char *_dl_profile; > /* Filename of the output file. */ > diff --git a/sysdeps/i386/nptl/tcb-access.h b/sysdeps/i386/nptl/tcb-access.h > index 28f0a5625f..f1bbccab6e 100644 > --- a/sysdeps/i386/nptl/tcb-access.h > +++ b/sysdeps/i386/nptl/tcb-access.h > @@ -123,3 +123,57 @@ > "i" (offsetof (struct pthread, member)), \ > "r" (idx)); \ > }}) > + > + > +/* Read member of the RSEQ area directly. */ > +#define RSEQ_GETMEM_VOLATILE(descr, member) \ > + ({ __typeof (descr->member) __value; \ > + _Static_assert (sizeof (__value) == 1 \ > + || sizeof (__value) == 4 \ > + || sizeof (__value) == 8, \ > + "size of per-thread data"); \ > + if (sizeof (__value) == 1) \ > + asm volatile ("movb %%gs:%P2(%3),%b0" \ > + : "=q" (__value) \ > + : "0" (0), "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else if (sizeof (__value) == 4) \ > + asm volatile ("movl %%gs:%P1(%2),%0" \ > + : "=r" (__value) \ > + : "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else /* 8 */ \ > + { \ > + asm volatile ("movl %%gs:%P1(%2),%%eax\n\t" \ > + "movl %%gs:4+%P1(%2),%%edx" \ > + : "=&A" (__value) \ > + : "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + } \ > + __value; }) > + > +/* Set member of the RSEQ area directly. */ > +#define RSEQ_SETMEM(descr, member, value) \ > + ({ \ > + _Static_assert (sizeof (descr->member) == 1 \ > + || sizeof (descr->member) == 4 \ > + || sizeof (descr->member) == 8, \ > + "size of per-thread data"); \ > + if (sizeof (descr->member) == 1) \ > + asm volatile ("movb %b0,%%gs:%P1(%2)" : \ > + : "iq" (value), \ > + "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else if (sizeof (descr->member) == 4) \ > + asm volatile ("movl %0,%%gs:%P1(%2)" : \ > + : "ir" (value), \ > + "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else /* 8 */ \ > + { \ > + asm volatile ("movl %%eax,%%gs:%P1(%2)\n\t" \ > + "movl %%edx,%%gs:4+%P1(%2)" : \ > + : "A" ((uint64_t) cast_to_integer (value)), \ > + "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + }}) > diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c > index 2ed98c5a31..406f6ba716 100644 > --- a/sysdeps/nptl/dl-tls_init_tp.c > +++ b/sysdeps/nptl/dl-tls_init_tp.c > @@ -108,7 +108,7 @@ __tls_init_tp (void) > /* We need a writable view of the variables. They are in > .data.relro and are not yet write-protected. */ > extern unsigned int size __asm__ ("__rseq_size"); > - size = sizeof (pd->rseq_area); > + size = GLRO (dl_tls_rseq_size); > } > > #ifdef RSEQ_SIG > @@ -118,7 +118,7 @@ __tls_init_tp (void) > if the rseq registration may have happened because RSEQ_SIG is > defined. */ > extern ptrdiff_t offset __asm__ ("__rseq_offset"); > - offset = (char *) &pd->rseq_area - (char *) __thread_pointer (); > + offset = GLRO (dl_tls_rseq_offset); > #endif > } > > diff --git a/sysdeps/nptl/tcb-access.h b/sysdeps/nptl/tcb-access.h > index d6dfd41391..7092428d38 100644 > --- a/sysdeps/nptl/tcb-access.h > +++ b/sysdeps/nptl/tcb-access.h > @@ -30,3 +30,8 @@ > descr->member = (value) > #define THREAD_SETMEM_NC(descr, member, idx, value) \ > descr->member[idx] = (value) > + > +#define RSEQ_GETMEM_VOLATILE(descr, member) \ > + THREAD_GETMEM_VOLATILE(descr, member) > +#define RSEQ_SETMEM(descr, member, value) \ > + THREAD_SETMEM(descr, member, value) > diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h > index 294880c04e..a944c71421 100644 > --- a/sysdeps/unix/sysv/linux/rseq-internal.h > +++ b/sysdeps/unix/sysv/linux/rseq-internal.h > @@ -24,6 +24,24 @@ > #include <stdbool.h> > #include <stdio.h> > #include <sys/rseq.h> > +#include <thread_pointer.h> > +#include <ldsodefs.h> > + > +/* rseq area registered with the kernel. Use a custom definition > + here to isolate from kernel struct rseq changes. The > + implementation of sched_getcpu needs acccess to the cpu_id field; > + the other fields are unused and not included here. */ > +struct rseq_area > +{ > + uint32_t cpu_id_start; > + uint32_t cpu_id; > +}; > + > +static inline struct rseq_area * > +rseq_get_area(void) > +{ > + return (struct rseq_area *) ((char *) __thread_pointer() + GLRO (dl_tls_rseq_offset)); > +} > > #ifdef RSEQ_SIG > static inline bool > @@ -31,20 +49,20 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) > { > if (do_rseq) > { > - int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, > - sizeof (self->rseq_area), > + int ret = INTERNAL_SYSCALL_CALL (rseq, rseq_get_area(), > + GLRO (dl_tls_rseq_size), > 0, RSEQ_SIG); > if (!INTERNAL_SYSCALL_ERROR_P (ret)) > return true; > } > - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); > + RSEQ_SETMEM (rseq_get_area(), cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); > return false; > } > #else /* RSEQ_SIG */ > static inline bool > rseq_register_current_thread (struct pthread *self, bool do_rseq) > { > - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); > + RSEQ_SETMEM (rseq_get_area(), cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); > return false; > } > #endif /* RSEQ_SIG */ > diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c b/sysdeps/unix/sysv/linux/sched_getcpu.c > index 4457d714bc..6109c68625 100644 > --- a/sysdeps/unix/sysv/linux/sched_getcpu.c > +++ b/sysdeps/unix/sysv/linux/sched_getcpu.c > @@ -19,6 +19,7 @@ > #include <sched.h> > #include <sysdep.h> > #include <sysdep-vdso.h> > +#include <rseq-internal.h> > > static int > vsyscall_sched_getcpu (void) > @@ -37,7 +38,7 @@ vsyscall_sched_getcpu (void) > int > sched_getcpu (void) > { > - int cpu_id = THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id); > + int cpu_id = RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id); > return __glibc_likely (cpu_id >= 0) ? cpu_id : vsyscall_sched_getcpu (); > } > #else /* RSEQ_SIG */ > diff --git a/sysdeps/unix/sysv/linux/tst-rseq.c b/sysdeps/unix/sysv/linux/tst-rseq.c > index 16983503b1..a6af0e783f 100644 > --- a/sysdeps/unix/sysv/linux/tst-rseq.c > +++ b/sysdeps/unix/sysv/linux/tst-rseq.c > @@ -31,18 +31,20 @@ > # include <syscall.h> > # include <thread_pointer.h> > # include <tls.h> > +# include <sys/auxv.h> > # include "tst-rseq.h" > > static void > do_rseq_main_test (void) > { > - struct pthread *pd = THREAD_SELF; > + size_t rseq_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), 32); > > TEST_VERIFY_EXIT (rseq_thread_registered ()); > TEST_COMPARE (__rseq_flags, 0); > - TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset > - == (char *) &pd->rseq_area); > - TEST_COMPARE (__rseq_size, sizeof (pd->rseq_area)); > + //FIXME: unsure how to test this > + //TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset > + // == (char *) &pd->rseq_area); > + TEST_COMPARE (__rseq_size, rseq_size); > } > > static void > diff --git a/sysdeps/unix/sysv/linux/tst-rseq.h b/sysdeps/unix/sysv/linux/tst-rseq.h > index 95d12048df..82ce90b62c 100644 > --- a/sysdeps/unix/sysv/linux/tst-rseq.h > +++ b/sysdeps/unix/sysv/linux/tst-rseq.h > @@ -23,11 +23,12 @@ > #include <syscall.h> > #include <sys/rseq.h> > #include <tls.h> > +#include <rseq-internal.h> > > static inline bool > rseq_thread_registered (void) > { > - return THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id) >= 0; > + return RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id) >= 0; > } > > static inline int > diff --git a/sysdeps/x86_64/nptl/tcb-access.h b/sysdeps/x86_64/nptl/tcb-access.h > index 110b1be44d..d45413b1e3 100644 > --- a/sysdeps/x86_64/nptl/tcb-access.h > +++ b/sysdeps/x86_64/nptl/tcb-access.h > @@ -67,7 +67,6 @@ > } \ > __value; }) > > - > /* Loading addresses of objects on x86-64 needs to be treated special > when generating PIC code. */ > #ifdef __pic__ > @@ -130,3 +129,57 @@ > "i" (offsetof (struct pthread, member[0])), \ > "r" (idx)); \ > }}) > + > +/* Read member of the RSEQ area directly. */ > +# define RSEQ_GETMEM_VOLATILE(descr, member) \ > + ({ __typeof (descr->member) __value; \ > + _Static_assert (sizeof (__value) == 1 \ > + || sizeof (__value) == 4 \ > + || sizeof (__value) == 8, \ > + "size of per-thread data"); \ > + if (sizeof (__value) == 1) \ > + asm volatile ("movb %%fs:%P2(%q3),%b0" \ > + : "=q" (__value) \ > + : "0" (0), "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else if (sizeof (__value) == 4) \ > + asm volatile ("movl %%fs:%P1(%q2),%0" \ > + : "=r" (__value) \ > + : "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else /* 8 */ \ > + { \ > + asm volatile ("movq %%fs:%P1(%q2),%q0" \ > + : "=r" (__value) \ > + : "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + } \ > + __value; }) > + > +/* Set member of the RSEQ area directly. */ > +# define RSEQ_SETMEM(descr, member, value) \ > + ({ \ > + _Static_assert (sizeof (descr->member) == 1 \ > + || sizeof (descr->member) == 4 \ > + || sizeof (descr->member) == 8, \ > + "size of per-thread data"); \ > + if (sizeof (descr->member) == 1) \ > + asm volatile ("movb %b0,%%fs:%P1(%q2)" : \ > + : "iq" (value), \ > + "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else if (sizeof (descr->member) == 4) \ > + asm volatile ("movl %0,%%fs:%P1(%q2)" : \ > + : IMM_MODE (value), \ > + "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + else /* 8 */ \ > + { \ > + /* Since movq takes a signed 32-bit immediate or a register source \ > + operand, use "er" constraint for 32-bit signed integer constant \ > + or register. */ \ > + asm volatile ("movq %q0,%%fs:%P1(%q2)" : \ > + : "er" ((uint64_t) cast_to_integer (value)), \ > + "i" (offsetof (struct rseq_area, member)), \ > + "r" (__rseq_offset)); \ > + }})
diff --git a/csu/libc-tls.c b/csu/libc-tls.c index cdf6442c02..8458f47577 100644 --- a/csu/libc-tls.c +++ b/csu/libc-tls.c @@ -62,6 +62,12 @@ size_t _dl_tls_static_surplus; dynamic TLS access (e.g. with TLSDESC). */ size_t _dl_tls_static_optional; +/* Offset of the rseq area from the thread pointer. */ +ptrdiff_t _dl_tls_rseq_offset; + +/* Size of the rseq area in the static TLS block. */ +size_t _dl_tls_rseq_size; + /* Generation counter for the dtv. */ size_t _dl_tls_generation; diff --git a/elf/dl-tls.c b/elf/dl-tls.c index c192b5a13a..e9390f87fd 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -24,6 +24,7 @@ #include <unistd.h> #include <sys/param.h> #include <atomic.h> +#include <sys/auxv.h> #include <tls.h> #include <dl-tls.h> @@ -75,6 +76,12 @@ /* Default for dl_tls_static_optional. */ #define OPTIONAL_TLS 512 +/* Minimum size of the rseq area. */ +#define TLS_DL_RSEQ_MIN_SIZE 32 + +/* Minimum size of the rseq area alignment. */ +#define TLS_DL_RSEQ_MIN_ALIGN 32 + /* Compute the static TLS surplus based on the namespace count and the TLS space that can be used for optimizations. */ static inline int @@ -297,6 +304,29 @@ _dl_determine_tlsoffset (void) slotinfo[cnt].map->l_tls_offset = off; } + /* Insert the rseq area block after the last TLS block. */ + + /* Get the rseq auxiliary vectors, 0 is returned when not implemented + and we then default to the rseq ABI minimums. */ + size_t rseq_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), TLS_DL_RSEQ_MIN_SIZE); + size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), TLS_DL_RSEQ_MIN_ALIGN); + + /* Make sure the rseq area size is a multiple of the requested + aligment. */ + rseq_size = roundup (rseq_size, rseq_align); + + /* Add the rseq area block to the global offset. */ + offset = roundup (offset, rseq_align) + rseq_size; + + /* Increase the max_align if necessary. */ + max_align = MAX (max_align, rseq_align); + + /* Record the rseq_area block size and offset. The offset is negative + with TLS_TCB_AT_TP because the TLS blocks are located before the + thread pointer. */ + GLRO (dl_tls_rseq_offset) = -offset; + GLRO (dl_tls_rseq_size) = rseq_size; + GL(dl_tls_static_used) = offset; GLRO (dl_tls_static_size) = (roundup (offset + GLRO(dl_tls_static_surplus), max_align) @@ -342,6 +372,32 @@ _dl_determine_tlsoffset (void) offset = off + slotinfo[cnt].map->l_tls_blocksize - firstbyte; } + /* Insert the rseq area block after the last TLS block. */ + + /* Get the rseq auxiliary vectors, 0 is returned when not implemented + and we then default to the rseq ABI minimums. */ + size_t rseq_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), TLS_DL_RSEQ_MIN_SIZE); + size_t rseq_align = MAX (getauxval (AT_RSEQ_ALIGN), TLS_DL_RSEQ_MIN_ALIGN); + + /* Make sure the rseq area size is a multiple of the requested + aligment. */ + rseq_size = roundup (rseq_size, rseq_align); + + /* Align the global offset to the beginning of the rseq area. */ + offset = roundup (offset, rseq_align); + + /* Record the rseq_area block size and offset. The offset is positive + with TLS_DTV_AT_TP because the TLS blocks are located after the + thread pointer. */ + GLRO (dl_tls_rseq_size) = rseq_size; + GLRO (dl_tls_rseq_offset) = offset; + + /* Add the rseq area block to the global offset. */ + offset += rseq_size; + + /* Increase the max_align if necessary. */ + max_align = MAX (max_align, rseq_align); + GL(dl_tls_static_used) = offset; GLRO (dl_tls_static_size) = roundup (offset + GLRO(dl_tls_static_surplus), TCB_ALIGNMENT); diff --git a/elf/rtld_static_init.c b/elf/rtld_static_init.c index aec8cc056b..84548023ab 100644 --- a/elf/rtld_static_init.c +++ b/elf/rtld_static_init.c @@ -78,6 +78,12 @@ __rtld_static_init (struct link_map *map) extern __typeof (dl->_dl_tls_static_size) _dl_tls_static_size attribute_hidden; dl->_dl_tls_static_size = _dl_tls_static_size; + extern __typeof (dl->_dl_tls_rseq_size) _dl_tls_rseq_size + attribute_hidden; + dl->_dl_tls_rseq_size = _dl_tls_rseq_size; + extern __typeof (dl->_dl_tls_rseq_offset) _dl_tls_rseq_offset + attribute_hidden; + dl->_dl_tls_rseq_offset = _dl_tls_rseq_offset; dl->_dl_find_object = _dl_find_object; __rtld_static_init_arch (map, dl); diff --git a/nptl/descr.h b/nptl/descr.h index 0171576c23..fadba1f619 100644 --- a/nptl/descr.h +++ b/nptl/descr.h @@ -404,25 +404,11 @@ struct pthread /* Used on strsignal. */ struct tls_internal_t tls_state; - /* rseq area registered with the kernel. Use a custom definition - here to isolate from kernel struct rseq changes. The - implementation of sched_getcpu needs acccess to the cpu_id field; - the other fields are unused and not included here. */ - union - { - struct - { - uint32_t cpu_id_start; - uint32_t cpu_id; - }; - char pad[32]; /* Original rseq area size. */ - } rseq_area __attribute__ ((aligned (32))); - /* Amount of end padding, if any, in this structure. - This definition relies on rseq_area being last. */ + This definition relies on tls_state being last. */ #define PTHREAD_STRUCT_END_PADDING \ - (sizeof (struct pthread) - offsetof (struct pthread, rseq_area) \ - + sizeof ((struct pthread) {}.rseq_area)) + (sizeof (struct pthread) - offsetof (struct pthread, tls_state) \ + + sizeof ((struct pthread) {}.tls_state)) } __attribute ((aligned (TCB_ALIGNMENT))); static inline bool diff --git a/nptl/pthread_create.c b/nptl/pthread_create.c index 63cb684f04..4ee0f24741 100644 --- a/nptl/pthread_create.c +++ b/nptl/pthread_create.c @@ -691,7 +691,7 @@ __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr, /* Inherit rseq registration state. Without seccomp filters, rseq registration will either always fail or always succeed. */ - if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0) + if ((int) RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id) >= 0) pd->flags |= ATTR_FLAG_DO_RSEQ; /* Initialize the field for the ID of the thread which is waiting diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index 9b50ddd09f..f8a3628bc0 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -610,6 +610,12 @@ struct rtld_global_ro See comments in elf/dl-tls.c where it is initialized. */ EXTERN size_t _dl_tls_static_surplus; + /* Offset of the rseq area from the thread pointer. */ + EXTERN ptrdiff_t _dl_tls_rseq_offset; + + /* Size of the rseq area in the static TLS block. */ + EXTERN size_t _dl_tls_rseq_size; + /* Name of the shared object to be profiled (if any). */ EXTERN const char *_dl_profile; /* Filename of the output file. */ diff --git a/sysdeps/i386/nptl/tcb-access.h b/sysdeps/i386/nptl/tcb-access.h index 28f0a5625f..f1bbccab6e 100644 --- a/sysdeps/i386/nptl/tcb-access.h +++ b/sysdeps/i386/nptl/tcb-access.h @@ -123,3 +123,57 @@ "i" (offsetof (struct pthread, member)), \ "r" (idx)); \ }}) + + +/* Read member of the RSEQ area directly. */ +#define RSEQ_GETMEM_VOLATILE(descr, member) \ + ({ __typeof (descr->member) __value; \ + _Static_assert (sizeof (__value) == 1 \ + || sizeof (__value) == 4 \ + || sizeof (__value) == 8, \ + "size of per-thread data"); \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%gs:%P2(%3),%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%gs:%P1(%2),%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else /* 8 */ \ + { \ + asm volatile ("movl %%gs:%P1(%2),%%eax\n\t" \ + "movl %%gs:4+%P1(%2),%%edx" \ + : "=&A" (__value) \ + : "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + } \ + __value; }) + +/* Set member of the RSEQ area directly. */ +#define RSEQ_SETMEM(descr, member, value) \ + ({ \ + _Static_assert (sizeof (descr->member) == 1 \ + || sizeof (descr->member) == 4 \ + || sizeof (descr->member) == 8, \ + "size of per-thread data"); \ + if (sizeof (descr->member) == 1) \ + asm volatile ("movb %b0,%%gs:%P1(%2)" : \ + : "iq" (value), \ + "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else if (sizeof (descr->member) == 4) \ + asm volatile ("movl %0,%%gs:%P1(%2)" : \ + : "ir" (value), \ + "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else /* 8 */ \ + { \ + asm volatile ("movl %%eax,%%gs:%P1(%2)\n\t" \ + "movl %%edx,%%gs:4+%P1(%2)" : \ + : "A" ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + }}) diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c index 2ed98c5a31..406f6ba716 100644 --- a/sysdeps/nptl/dl-tls_init_tp.c +++ b/sysdeps/nptl/dl-tls_init_tp.c @@ -108,7 +108,7 @@ __tls_init_tp (void) /* We need a writable view of the variables. They are in .data.relro and are not yet write-protected. */ extern unsigned int size __asm__ ("__rseq_size"); - size = sizeof (pd->rseq_area); + size = GLRO (dl_tls_rseq_size); } #ifdef RSEQ_SIG @@ -118,7 +118,7 @@ __tls_init_tp (void) if the rseq registration may have happened because RSEQ_SIG is defined. */ extern ptrdiff_t offset __asm__ ("__rseq_offset"); - offset = (char *) &pd->rseq_area - (char *) __thread_pointer (); + offset = GLRO (dl_tls_rseq_offset); #endif } diff --git a/sysdeps/nptl/tcb-access.h b/sysdeps/nptl/tcb-access.h index d6dfd41391..7092428d38 100644 --- a/sysdeps/nptl/tcb-access.h +++ b/sysdeps/nptl/tcb-access.h @@ -30,3 +30,8 @@ descr->member = (value) #define THREAD_SETMEM_NC(descr, member, idx, value) \ descr->member[idx] = (value) + +#define RSEQ_GETMEM_VOLATILE(descr, member) \ + THREAD_GETMEM_VOLATILE(descr, member) +#define RSEQ_SETMEM(descr, member, value) \ + THREAD_SETMEM(descr, member, value) diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h index 294880c04e..a944c71421 100644 --- a/sysdeps/unix/sysv/linux/rseq-internal.h +++ b/sysdeps/unix/sysv/linux/rseq-internal.h @@ -24,6 +24,24 @@ #include <stdbool.h> #include <stdio.h> #include <sys/rseq.h> +#include <thread_pointer.h> +#include <ldsodefs.h> + +/* rseq area registered with the kernel. Use a custom definition + here to isolate from kernel struct rseq changes. The + implementation of sched_getcpu needs acccess to the cpu_id field; + the other fields are unused and not included here. */ +struct rseq_area +{ + uint32_t cpu_id_start; + uint32_t cpu_id; +}; + +static inline struct rseq_area * +rseq_get_area(void) +{ + return (struct rseq_area *) ((char *) __thread_pointer() + GLRO (dl_tls_rseq_offset)); +} #ifdef RSEQ_SIG static inline bool @@ -31,20 +49,20 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) { if (do_rseq) { - int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, - sizeof (self->rseq_area), + int ret = INTERNAL_SYSCALL_CALL (rseq, rseq_get_area(), + GLRO (dl_tls_rseq_size), 0, RSEQ_SIG); if (!INTERNAL_SYSCALL_ERROR_P (ret)) return true; } - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); + RSEQ_SETMEM (rseq_get_area(), cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); return false; } #else /* RSEQ_SIG */ static inline bool rseq_register_current_thread (struct pthread *self, bool do_rseq) { - THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); + RSEQ_SETMEM (rseq_get_area(), cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); return false; } #endif /* RSEQ_SIG */ diff --git a/sysdeps/unix/sysv/linux/sched_getcpu.c b/sysdeps/unix/sysv/linux/sched_getcpu.c index 4457d714bc..6109c68625 100644 --- a/sysdeps/unix/sysv/linux/sched_getcpu.c +++ b/sysdeps/unix/sysv/linux/sched_getcpu.c @@ -19,6 +19,7 @@ #include <sched.h> #include <sysdep.h> #include <sysdep-vdso.h> +#include <rseq-internal.h> static int vsyscall_sched_getcpu (void) @@ -37,7 +38,7 @@ vsyscall_sched_getcpu (void) int sched_getcpu (void) { - int cpu_id = THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id); + int cpu_id = RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id); return __glibc_likely (cpu_id >= 0) ? cpu_id : vsyscall_sched_getcpu (); } #else /* RSEQ_SIG */ diff --git a/sysdeps/unix/sysv/linux/tst-rseq.c b/sysdeps/unix/sysv/linux/tst-rseq.c index 16983503b1..a6af0e783f 100644 --- a/sysdeps/unix/sysv/linux/tst-rseq.c +++ b/sysdeps/unix/sysv/linux/tst-rseq.c @@ -31,18 +31,20 @@ # include <syscall.h> # include <thread_pointer.h> # include <tls.h> +# include <sys/auxv.h> # include "tst-rseq.h" static void do_rseq_main_test (void) { - struct pthread *pd = THREAD_SELF; + size_t rseq_size = MAX (getauxval (AT_RSEQ_FEATURE_SIZE), 32); TEST_VERIFY_EXIT (rseq_thread_registered ()); TEST_COMPARE (__rseq_flags, 0); - TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset - == (char *) &pd->rseq_area); - TEST_COMPARE (__rseq_size, sizeof (pd->rseq_area)); + //FIXME: unsure how to test this + //TEST_VERIFY ((char *) __thread_pointer () + __rseq_offset + // == (char *) &pd->rseq_area); + TEST_COMPARE (__rseq_size, rseq_size); } static void diff --git a/sysdeps/unix/sysv/linux/tst-rseq.h b/sysdeps/unix/sysv/linux/tst-rseq.h index 95d12048df..82ce90b62c 100644 --- a/sysdeps/unix/sysv/linux/tst-rseq.h +++ b/sysdeps/unix/sysv/linux/tst-rseq.h @@ -23,11 +23,12 @@ #include <syscall.h> #include <sys/rseq.h> #include <tls.h> +#include <rseq-internal.h> static inline bool rseq_thread_registered (void) { - return THREAD_GETMEM_VOLATILE (THREAD_SELF, rseq_area.cpu_id) >= 0; + return RSEQ_GETMEM_VOLATILE (rseq_get_area(), cpu_id) >= 0; } static inline int diff --git a/sysdeps/x86_64/nptl/tcb-access.h b/sysdeps/x86_64/nptl/tcb-access.h index 110b1be44d..d45413b1e3 100644 --- a/sysdeps/x86_64/nptl/tcb-access.h +++ b/sysdeps/x86_64/nptl/tcb-access.h @@ -67,7 +67,6 @@ } \ __value; }) - /* Loading addresses of objects on x86-64 needs to be treated special when generating PIC code. */ #ifdef __pic__ @@ -130,3 +129,57 @@ "i" (offsetof (struct pthread, member[0])), \ "r" (idx)); \ }}) + +/* Read member of the RSEQ area directly. */ +# define RSEQ_GETMEM_VOLATILE(descr, member) \ + ({ __typeof (descr->member) __value; \ + _Static_assert (sizeof (__value) == 1 \ + || sizeof (__value) == 4 \ + || sizeof (__value) == 8, \ + "size of per-thread data"); \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%fs:%P2(%q3),%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%fs:%P1(%q2),%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else /* 8 */ \ + { \ + asm volatile ("movq %%fs:%P1(%q2),%q0" \ + : "=r" (__value) \ + : "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + } \ + __value; }) + +/* Set member of the RSEQ area directly. */ +# define RSEQ_SETMEM(descr, member, value) \ + ({ \ + _Static_assert (sizeof (descr->member) == 1 \ + || sizeof (descr->member) == 4 \ + || sizeof (descr->member) == 8, \ + "size of per-thread data"); \ + if (sizeof (descr->member) == 1) \ + asm volatile ("movb %b0,%%fs:%P1(%q2)" : \ + : "iq" (value), \ + "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else if (sizeof (descr->member) == 4) \ + asm volatile ("movl %0,%%fs:%P1(%q2)" : \ + : IMM_MODE (value), \ + "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + else /* 8 */ \ + { \ + /* Since movq takes a signed 32-bit immediate or a register source \ + operand, use "er" constraint for 32-bit signed integer constant \ + or register. */ \ + asm volatile ("movq %q0,%%fs:%P1(%q2)" : \ + : "er" ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct rseq_area, member)), \ + "r" (__rseq_offset)); \ + }})