| Message ID | 20150711202742.GA9074@gmail.com |
|---|---|
| State | New |
| Headers | show |
On Sat, Jul 11, 2015 at 1:27 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote: >> On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote: >> > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote: >> > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote: >> > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote: >> > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com> >> > >> >> wrote: >> > >> >> > Fixed in the attached patch >> > >> >> > >> > >> >> >> > >> >> I fixed some typos and updated sysdeps/i386/configure for >> > >> >> HAVE_MPX_SUPPORT. Please verify both with HAVE_MPX_SUPPORT and >> > >> >> without on i386 and x86-64. >> > >> > >> > >> > Done, all works fine >> > >> > >> > >> >> > >> I checked it in for you. >> > >> >> > > These are nice but you could have same problem with lazy tls allocation. >> > > I wrote patch to merge trampolines, which now conflicts. Could you write >> > > similar patch to solve that? Original purpose was to always save xmm >> > > registers so we could use sse2 routines which speeds up lookup time. >> > >> > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How >> > much gain it will give us? >> > >> I couldn't measure that without patch. Gain now would be big as we now >> use byte-by-byte loop to check symbol name which is slow, especially >> with c++ name mangling. Would be following benchmark good to measure >> speedup or do I need to measure startup time which is bit harder? >> > > Please try this. > > > H.J. 
> --- > sysdeps/x86_64/dl-trampoline.S | 42 +++++++++++++++++++++++++++++++++++------- > 1 file changed, 35 insertions(+), 7 deletions(-) > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S > index 678c57f..ea81d7d 100644 > --- a/sysdeps/x86_64/dl-trampoline.S > +++ b/sysdeps/x86_64/dl-trampoline.S > @@ -27,26 +27,36 @@ > /* Area on stack to save and restore registers used for parameter > passing when calling _dl_fixup. */ > #ifdef __ILP32__ > -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */ > -# define REGISTER_SAVE_AREA (8 * 7) > -# define REGISTER_SAVE_RAX 0 > +/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to > + XMM7. */ > +# define REGISTER_SAVE_AREA (8 * 7 + 16 * 8) > +/* Align XMM register save area to 16 bytes. */ > +# define REGISTER_SAVE_XMM0 0 > # define PRESERVE_BND_REGS_PREFIX > #else > -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0, > - BND1, BND2, BND3. */ > -# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4) > +/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as > + BND0, BND1, BND2, BND3 and XMM0 to XMM7. */ > +# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4 + 16 * 8) > /* Align bound register save area to 16 bytes. 
*/ > # define REGISTER_SAVE_BND0 0 > # define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) > # define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) > # define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) > -# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) > +# define REGISTER_SAVE_XMM0 (REGISTER_SAVE_BND3 + 16) > # ifdef HAVE_MPX_SUPPORT > # define PRESERVE_BND_REGS_PREFIX bnd > # else > # define PRESERVE_BND_REGS_PREFIX .byte 0xf2 > # endif > #endif > +#define REGISTER_SAVE_XMM1 (REGISTER_SAVE_XMM0 + 16) > +#define REGISTER_SAVE_XMM2 (REGISTER_SAVE_XMM1 + 16) > +#define REGISTER_SAVE_XMM3 (REGISTER_SAVE_XMM2 + 16) > +#define REGISTER_SAVE_XMM4 (REGISTER_SAVE_XMM3 + 16) > +#define REGISTER_SAVE_XMM5 (REGISTER_SAVE_XMM4 + 16) > +#define REGISTER_SAVE_XMM6 (REGISTER_SAVE_XMM5 + 16) > +#define REGISTER_SAVE_XMM7 (REGISTER_SAVE_XMM6 + 16) > +#define REGISTER_SAVE_RAX (REGISTER_SAVE_XMM7 + 16) > #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) > #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) > #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) > @@ -71,6 +81,15 @@ _dl_runtime_resolve: > movq %rdi, REGISTER_SAVE_RDI(%rsp) > movq %r8, REGISTER_SAVE_R8(%rsp) > movq %r9, REGISTER_SAVE_R9(%rsp) > + # movaps is 1-byte shorter. > + movaps %xmm0, REGISTER_SAVE_XMM0(%rsp) > + movaps %xmm1, REGISTER_SAVE_XMM1(%rsp) > + movaps %xmm2, REGISTER_SAVE_XMM2(%rsp) > + movaps %xmm3, REGISTER_SAVE_XMM3(%rsp) > + movaps %xmm4, REGISTER_SAVE_XMM4(%rsp) > + movaps %xmm5, REGISTER_SAVE_XMM5(%rsp) > + movaps %xmm6, REGISTER_SAVE_XMM6(%rsp) > + movaps %xmm7, REGISTER_SAVE_XMM7(%rsp) > #ifndef __ILP32__ > # We also have to preserve bound registers. These are nops if > # Intel MPX isn't available or disabled. > @@ -123,6 +142,15 @@ _dl_runtime_resolve: > movq REGISTER_SAVE_RDX(%rsp), %rdx > movq REGISTER_SAVE_RCX(%rsp), %rcx > movq REGISTER_SAVE_RAX(%rsp), %rax > + # movaps is 1-byte shorter. 
> + movaps REGISTER_SAVE_XMM0(%rsp), %xmm0 > + movaps REGISTER_SAVE_XMM1(%rsp), %xmm1 > + movaps REGISTER_SAVE_XMM2(%rsp), %xmm2 > + movaps REGISTER_SAVE_XMM3(%rsp), %xmm3 > + movaps REGISTER_SAVE_XMM4(%rsp), %xmm4 > + movaps REGISTER_SAVE_XMM5(%rsp), %xmm5 > + movaps REGISTER_SAVE_XMM6(%rsp), %xmm6 > + movaps REGISTER_SAVE_XMM7(%rsp), %xmm7 > # Adjust stack(PLT did 2 pushes) > addq $(REGISTER_SAVE_AREA + 16), %rsp > cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16)) > -- > 2.4.3 > movaps doesn't work since the stack is misaligned when __tls_get_addr is called due to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index 678c57f..ea81d7d 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -27,26 +27,36 @@ /* Area on stack to save and restore registers used for parameter passing when calling _dl_fixup. */ #ifdef __ILP32__ -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX. */ -# define REGISTER_SAVE_AREA (8 * 7) -# define REGISTER_SAVE_RAX 0 +/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to + XMM7. */ +# define REGISTER_SAVE_AREA (8 * 7 + 16 * 8) +/* Align XMM register save area to 16 bytes. */ +# define REGISTER_SAVE_XMM0 0 # define PRESERVE_BND_REGS_PREFIX #else -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0, - BND1, BND2, BND3. */ -# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4) +/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as + BND0, BND1, BND2, BND3 and XMM0 to XMM7. */ +# define REGISTER_SAVE_AREA (8 * 7 + 16 * 4 + 16 * 8) /* Align bound register save area to 16 bytes. 
*/ # define REGISTER_SAVE_BND0 0 # define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16) # define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16) # define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16) -# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16) +# define REGISTER_SAVE_XMM0 (REGISTER_SAVE_BND3 + 16) # ifdef HAVE_MPX_SUPPORT # define PRESERVE_BND_REGS_PREFIX bnd # else # define PRESERVE_BND_REGS_PREFIX .byte 0xf2 # endif #endif +#define REGISTER_SAVE_XMM1 (REGISTER_SAVE_XMM0 + 16) +#define REGISTER_SAVE_XMM2 (REGISTER_SAVE_XMM1 + 16) +#define REGISTER_SAVE_XMM3 (REGISTER_SAVE_XMM2 + 16) +#define REGISTER_SAVE_XMM4 (REGISTER_SAVE_XMM3 + 16) +#define REGISTER_SAVE_XMM5 (REGISTER_SAVE_XMM4 + 16) +#define REGISTER_SAVE_XMM6 (REGISTER_SAVE_XMM5 + 16) +#define REGISTER_SAVE_XMM7 (REGISTER_SAVE_XMM6 + 16) +#define REGISTER_SAVE_RAX (REGISTER_SAVE_XMM7 + 16) #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8) #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8) #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8) @@ -71,6 +81,15 @@ _dl_runtime_resolve: movq %rdi, REGISTER_SAVE_RDI(%rsp) movq %r8, REGISTER_SAVE_R8(%rsp) movq %r9, REGISTER_SAVE_R9(%rsp) + # movaps is 1-byte shorter. + movaps %xmm0, REGISTER_SAVE_XMM0(%rsp) + movaps %xmm1, REGISTER_SAVE_XMM1(%rsp) + movaps %xmm2, REGISTER_SAVE_XMM2(%rsp) + movaps %xmm3, REGISTER_SAVE_XMM3(%rsp) + movaps %xmm4, REGISTER_SAVE_XMM4(%rsp) + movaps %xmm5, REGISTER_SAVE_XMM5(%rsp) + movaps %xmm6, REGISTER_SAVE_XMM6(%rsp) + movaps %xmm7, REGISTER_SAVE_XMM7(%rsp) #ifndef __ILP32__ # We also have to preserve bound registers. These are nops if # Intel MPX isn't available or disabled. @@ -123,6 +142,15 @@ _dl_runtime_resolve: movq REGISTER_SAVE_RDX(%rsp), %rdx movq REGISTER_SAVE_RCX(%rsp), %rcx movq REGISTER_SAVE_RAX(%rsp), %rax + # movaps is 1-byte shorter. 
+ movaps REGISTER_SAVE_XMM0(%rsp), %xmm0 + movaps REGISTER_SAVE_XMM1(%rsp), %xmm1 + movaps REGISTER_SAVE_XMM2(%rsp), %xmm2 + movaps REGISTER_SAVE_XMM3(%rsp), %xmm3 + movaps REGISTER_SAVE_XMM4(%rsp), %xmm4 + movaps REGISTER_SAVE_XMM5(%rsp), %xmm5 + movaps REGISTER_SAVE_XMM6(%rsp), %xmm6 + movaps REGISTER_SAVE_XMM7(%rsp), %xmm7 # Adjust stack(PLT did 2 pushes) addq $(REGISTER_SAVE_AREA + 16), %rsp cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))