Message ID | CAMe9rOp15N_5xrbx4uCdKY7v9iMiN7gQ0ZBOvaoEJOO2ZLM+TA@mail.gmail.com |
---|---|
State | New |
Headers | show |
On 07/04/2017 05:16 PM, H.J. Lu wrote: >> Furthermore, the code GCC generates for stack realignment is really bad, > > GCC generates very good code for stack realignment when > -maccumulate-outgoing-args is used. I wonder why that isn't enabled by the force attribute. >> # define __tls_get_addr __tls_get_addr_default >> +# include <elf/dl-tls.c> >> + >> +# undef __tls_get_addr_default > ^^^^^^^ Shouldn't it be __tls_get_addr? Looks this way. I'd have to re-test and see if it makes a difference. >> -typedef struct dl_tls_index >> -{ >> - uint64_t ti_module; >> - uint64_t ti_offset; >> -} tls_index; > Is this sysdeps/x86_64/dl-tlsdesc.h change related to this? It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from the same file, causing a multiple definition error without the removal of the conflicting definition. > __tls_get_addr_compat: > + .type __tls_get_addr_compat,@function > + .global __tls_get_addr_compat > + strong_alias (__tls_get_addr_compat, __tls_get_addr) > > We can use ENTRY/END here. Why do we need __tls_get_addr_compat? > Can we just have __tls_get_addr? That confuses the linker once we add the .symver directive. > Since we are talking performance here, we should add __tls_get_addr_slow > to only handle slow paths. I'd prefer something that adds a new symbol version for the non-aligning implementation, so that we eventually move away from the aligning one. > Here is the patch which implements those. It is tested on x86-64 > and x32. Is this really needed on x32, too? Did GCC have the same bug? Thanks, Florian
On Tue, Jul 4, 2017 at 8:47 AM, Florian Weimer <fweimer@redhat.com> wrote: > On 07/04/2017 05:16 PM, H.J. Lu wrote: > >>> Furthermore, the code GCC generates for stack realignment is really bad, >> >> GCC generates very good code for stack realignment when >> -maccumulate-outgoing-args is used. > > I wonder why that isn't enabled by the force attribute. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81312 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81313 >>> # define __tls_get_addr __tls_get_addr_default >>> +# include <elf/dl-tls.c> >>> + >>> +# undef __tls_get_addr_default >> ^^^^^^^ Shouldn't it be __tls_get_addr? > > Looks this way. I'd have to re-test and see if it makes a difference. > >>> -typedef struct dl_tls_index >>> -{ >>> - uint64_t ti_module; >>> - uint64_t ti_offset; >>> -} tls_index; > >> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this? > > It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from > the same file, causing a multiple definition error without the removal > of the conflicting definition. It is a cleanup. But I don't believe it is required here. >> __tls_get_addr_compat: >> + .type __tls_get_addr_compat,@function >> + .global __tls_get_addr_compat >> + strong_alias (__tls_get_addr_compat, __tls_get_addr) >> >> We can use ENTRY/END here. Why do we need __tls_get_addr_compat? >> Can we just have __tls_get_addr? > > That confuses the linker once we add the .symver directive. I don't think it is a case. We just define a different __tls_get_addr for x86-64. >> Since we are talking performance here, we should add __tls_get_addr_slow >> to only handle slow paths. > > I'd prefer something that adds a new symbol version for the non-aligning > implementation, so that we eventually move away from the aligning one. I thought about it. There is no easy way to do it without linker help. We can add ___tls_get_addr, similar to i386, which will take an aligned stack. Linker must support ___tls_get_addr. Then we can use weakref to redirect __tls_get_addr to ___tls_get_addr if linker supports ___tls_get_addr and GCC doesn't. >> Here is the patch which implements those. It is tested on x86-64 >> and x32. > > Is this really needed on x32, too? Did GCC have the same bug? > Yes, it is needed for x32 since the bug is only fixed on I thiuGCC 4.9 branch and above.
On Tue, Jul 4, 2017 at 8:55 AM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Tue, Jul 4, 2017 at 8:47 AM, Florian Weimer <fweimer@redhat.com> wrote: >> On 07/04/2017 05:16 PM, H.J. Lu wrote: >> >>>> Furthermore, the code GCC generates for stack realignment is really bad, >>> >>> GCC generates very good code for stack realignment when >>> -maccumulate-outgoing-args is used. >> >> I wonder why that isn't enabled by the force attribute. > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81312 > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81313 > >>>> # define __tls_get_addr __tls_get_addr_default >>>> +# include <elf/dl-tls.c> >>>> + >>>> +# undef __tls_get_addr_default >>> ^^^^^^^ Shouldn't it be __tls_get_addr? >> >> Looks this way. I'd have to re-test and see if it makes a difference. >> >>>> -typedef struct dl_tls_index >>>> -{ >>>> - uint64_t ti_module; >>>> - uint64_t ti_offset; >>>> -} tls_index; >> >>> Is this sysdeps/x86_64/dl-tlsdesc.h change related to this? >> >> It is because the patch includes both <dl-tls.h> and <dl-tlsdesc.h> from >> the same file, causing a multiple definition error without the removal >> of the conflicting definition. > > It is a cleanup. But I don't believe it is required here. > >>> __tls_get_addr_compat: >>> + .type __tls_get_addr_compat,@function >>> + .global __tls_get_addr_compat >>> + strong_alias (__tls_get_addr_compat, __tls_get_addr) >>> >>> We can use ENTRY/END here. Why do we need __tls_get_addr_compat? >>> Can we just have __tls_get_addr? >> >> That confuses the linker once we add the .symver directive. > > I don't think it is a case. We just define a different __tls_get_addr for > x86-64. > >>> Since we are talking performance here, we should add __tls_get_addr_slow >>> to only handle slow paths. >> >> I'd prefer something that adds a new symbol version for the non-aligning >> implementation, so that we eventually move away from the aligning one. > > I thought about it. There is no easy way to do it without linker help. We > can add ___tls_get_addr, similar to i386, which will take an aligned > stack. Linker must support ___tls_get_addr. Then we can use weakref > to redirect __tls_get_addr to ___tls_get_addr if linker supports > ___tls_get_addr and GCC doesn't. > >>> Here is the patch which implements those. It is tested on x86-64 >>> and x32. >> >> Is this really needed on x32, too? Did GCC have the same bug? >> > > Yes, it is needed for x32 since the bug is only fixed on I thiuGCC 4.9 branch > and above. > > -- > H.J.
From eb4dd17f2f8222278567dc085071bd85c6669cc4 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.tools@gmail.com> Date: Mon, 3 Jul 2017 13:16:57 -0700 Subject: [PATCH] x86-64: Align the stack in __tls_get_addr [BZ #21609] This change forces realignment of the stack pointer in __tls_get_addr, so that binaries compiled by GCCs older than GCC 4.9: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066 continue to work even if vector instructions are used in glibc which require the ABI stack realignment. __tls_get_addr_slow is added to handle the slow paths in the default implementation of__tls_get_addr in elf/dl-tls.c. The new __tls_get_addr calls __tls_get_addr_slow after realigning the stack. Internal calls within ld.so go directly to the default implementation of __tls_get_addr because they do not need stack realignment. 2017-07-04 Florian Weimer <fweimer@redhat.com> H.J. Lu <hongjiu.lu@intel.com> [BZ #21609] * sysdeps/x86_64/Makefile (sysdep-dl-routines): Add tls_get_addr. (gen-as-const-headers): Add rtld-offsets.sym. * sysdeps/x86_64/dl-tls.c: New file. * sysdeps/x86_64/rtld-offsets.sym: Likwise. * sysdeps/x86_64/tls_get_addr.S: Likewise. * sysdeps/x86_64/dl-tls.h: Add multiple inclusion guards. * sysdeps/x86_64/tlsdesc.sym (TI_MODULE_OFFSET): New. (TI_OFFSET_OFFSET): Likwise. --- sysdeps/x86_64/Makefile | 4 +-- sysdeps/x86_64/dl-tls.c | 53 +++++++++++++++++++++++++++++++++++ sysdeps/x86_64/dl-tls.h | 5 ++++ sysdeps/x86_64/rtld-offsets.sym | 6 ++++ sysdeps/x86_64/tls_get_addr.S | 61 +++++++++++++++++++++++++++++++++++++++++ sysdeps/x86_64/tlsdesc.sym | 3 ++ 6 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 sysdeps/x86_64/dl-tls.c create mode 100644 sysdeps/x86_64/rtld-offsets.sym create mode 100644 sysdeps/x86_64/tls_get_addr.S diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 5075c91..132470d 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -27,7 +27,7 @@ ifeq ($(subdir),elf) CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\ -mno-mmx) -sysdep-dl-routines += tlsdesc dl-tlsdesc +sysdep-dl-routines += tlsdesc dl-tlsdesc tls_get_addr tests += ifuncmain8 modules-names += ifuncmod8 @@ -120,5 +120,5 @@ endif endif ifeq ($(subdir),csu) -gen-as-const-headers += tlsdesc.sym +gen-as-const-headers += tlsdesc.sym rtld-offsets.sym endif diff --git a/sysdeps/x86_64/dl-tls.c b/sysdeps/x86_64/dl-tls.c new file mode 100644 index 0000000..567224e --- /dev/null +++ b/sysdeps/x86_64/dl-tls.c @@ -0,0 +1,53 @@ +/* Thread-local storage handling in the ELF dynamic linker. x86-64 version. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef SHARED +/* Work around GCC PR58066, due to which __tls_get_addr may be called + with an unaligned stack. The compat implementation is in + tls_get_addr-compat.S. */ + +# include <dl-tls.h> + +/* Define __tls_get_addr within elf/dl-tls.c under a different + name. */ +extern __typeof__ (__tls_get_addr) __tls_get_addr_default; + +# define __tls_get_addr __tls_get_addr_default +# include <elf/dl-tls.c> +# undef __tls_get_addr + +hidden_ver (__tls_get_addr_default, __tls_get_addr) + +/* Only handle slow paths for __tls_get_addr. */ +attribute_hidden +void * +__tls_get_addr_slow (GET_ADDR_ARGS) +{ + dtv_t *dtv = THREAD_DTV (); + + if (__glibc_unlikely (dtv[0].counter != GL(dl_tls_generation))) + return update_get_addr (GET_ADDR_PARAM); + + return tls_get_addr_tail (GET_ADDR_PARAM, dtv, NULL); +} +#else + +/* No compatibility symbol needed. */ +# include <elf/dl-tls.c> + +#endif diff --git a/sysdeps/x86_64/dl-tls.h b/sysdeps/x86_64/dl-tls.h index 4a59d2a..c2fb56c 100644 --- a/sysdeps/x86_64/dl-tls.h +++ b/sysdeps/x86_64/dl-tls.h @@ -16,6 +16,9 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ +#ifndef _X86_64_DL_TLS_H +#define _X86_64_DL_TLS_H + #include <stdint.h> /* Type used for the representation of TLS information in the GOT. */ @@ -27,3 +30,5 @@ typedef struct dl_tls_index extern void *__tls_get_addr (tls_index *ti); + +#endif /* _X86_64_DL_TLS_H */ diff --git a/sysdeps/x86_64/rtld-offsets.sym b/sysdeps/x86_64/rtld-offsets.sym new file mode 100644 index 0000000..fd41b51 --- /dev/null +++ b/sysdeps/x86_64/rtld-offsets.sym @@ -0,0 +1,6 @@ +#define SHARED +#include <ldsodefs.h> + +-- + +GL_TLS_GENERATION_OFFSET offsetof (struct rtld_global, _dl_tls_generation) diff --git a/sysdeps/x86_64/tls_get_addr.S b/sysdeps/x86_64/tls_get_addr.S new file mode 100644 index 0000000..9d38fb3 --- /dev/null +++ b/sysdeps/x86_64/tls_get_addr.S @@ -0,0 +1,61 @@ +/* Stack-aligning implementation of __tls_get_addr. x86-64 version. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef SHARED + +# include <sysdep.h> +# include "tlsdesc.h" +# include "rtld-offsets.h" + +/* See __tls_get_addr and __tls_get_addr_slow in dl-tls.c. This function + call __tls_get_addr_slow on both slow paths. It realigns the stack + before the call to work around GCC PR58066. */ + +ENTRY (__tls_get_addr) + mov %fs:DTV_OFFSET, %RDX_LP + mov GL_TLS_GENERATION_OFFSET+_rtld_local(%rip), %RAX_LP + /* GL(dl_tls_generation) == dtv[0].counter */ + cmp %RAX_LP, (%rdx) + jne 1f + mov TI_MODULE_OFFSET(%rdi), %RAX_LP + /* dtv[ti->ti_module] */ +# ifdef __LP64__ + salq $4, %rax + movq (%rdx,%rax), %rax +# else + movl (%rdx,%rax, 8), %eax +# endif + cmp $-1, %RAX_LP + je 1f + add TI_OFFSET_OFFSET(%rdi), %RAX_LP + ret +1: + /* On the slow path, align the stack. */ + pushq %rbp + cfi_def_cfa_offset (16) + cfi_offset (%rbp, -16) + mov %RSP_LP, %RBP_LP + cfi_def_cfa_register (%rbp) + and $-16, %RSP_LP + call __tls_get_addr_slow + mov %RBP_LP, %RSP_LP + popq %rbp + cfi_def_cfa (%rsp, 8) + ret +END (__tls_get_addr) +#endif /* SHARED */ diff --git a/sysdeps/x86_64/tlsdesc.sym b/sysdeps/x86_64/tlsdesc.sym index 3385497..fc897ab 100644 --- a/sysdeps/x86_64/tlsdesc.sym +++ b/sysdeps/x86_64/tlsdesc.sym @@ -15,3 +15,6 @@ TLSDESC_ARG offsetof(struct tlsdesc, arg) TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) + +TI_MODULE_OFFSET offsetof(tls_index, ti_module) +TI_OFFSET_OFFSET offsetof(tls_index, ti_offset) -- 2.9.4