Message ID | Ye1NJQwEpe7AyNYD@squeak.grove.modra.org |
---|---|
State | New |
Headers | show |
Series | [PowerPC64] Use medium model toc accesses throughout | expand |
On 1/23/22 6:42 AM, Alan Modra via Libc-alpha wrote: > The PowerPC64 linker edits medium model toc-indirect code to toc-pointer > relative: > addis r9,r2,tc_entry_for_var@toc@ha > ld r9,tc_entry_for_var@toc@l(r9) > becomes > addis r9,r2,(var-.TOC.)@ha > addi r9,r9,(var-.TOC.)@l > when "var" is known to be local to the binary. This isn't done for > small-model toc-indirect code, because "var" is almost guaranteed to > be too far away from .TOC. for a 16-bit signed offset. And, because > the analysis of which .toc entry can be removed becomes much more > complicated in objects that mix code models, they aren't removed if > any small-model toc sequence appears in an object file. > > Unfortunately glibc's build of ld.so smashes the needed objects > together in a ld -r linking stage. This means the GOT/TOC is left > with a whole lot of relative relocations which is untidy, but in > itself is not a serious problem. However, static-pie on powerpc64 > bombs due to a segfault caused by one of the small-model accesses > before _dl_relocate_static_pie. (The very first one in rcrt1.o > passing start_addresses in r8 to __libc_start_main.) > > So this patch makes all the toc/got accesses in assembly medium code > model, and a couple of functions hidden. By itself this is *not* > enough to give us working static-pie, but it is useful anyway to > enable better linker optimisation. > > There's a serious problem in libgcc too. libgcc ifuncs access the > AT_HWCAP words stored in the tcb with an offset from the thread > pointer (r13), but r13 isn't set at the time _dl_relocate_static_pie > runs, and I'm loathe to try calling init_tls early. A better approach > that might work is to fake r13 so that _dl_hwcap is at the expected > offset where we'd normally find the tcb hwcap words. > > Tested for regressions with a powerpc64le-linux build and test run. > OK to apply? > > diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S > index 23debc2faf..45b821607b 100644 > --- a/sysdeps/powerpc/powerpc64/dl-trampoline.S > +++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S > @@ -32,6 +32,7 @@ > because gcc as of 2010/05 doesn't allocate a proper stack frame for > a function that makes no calls except for __tls_get_addr and we > might be here resolving the __tls_get_addr call. */ > + .hidden _dl_runtime_resolve > #define INT_PARMS FRAME_MIN_SIZE > ENTRY (_dl_runtime_resolve, 4) > stdu r1,-FRAME_SIZE(r1) > @@ -195,6 +196,7 @@ END(_dl_runtime_resolve) > parm1 (r3) and the index (r0) needs to be converted to an offset > (index * 24) in parm2 (r4). */ > #ifndef PROF > + .hidden _dl_profile_resolve > ENTRY (_dl_profile_resolve, 4) > /* Spill r30, r31 to preserve the link_map* and reloc_addr, in case we > need to call _dl_audit_pltexit. */ LGTM (this should wait until after the freeze), though these two changes seem orthogonal to the commit title. For my education, why are are small model accesses used in these files?
On Fri, Jan 28, 2022 at 11:48:44AM -0600, Paul E Murphy wrote: > > --- a/sysdeps/powerpc/powerpc64/dl-trampoline.S > > +++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S > > @@ -32,6 +32,7 @@ > > because gcc as of 2010/05 doesn't allocate a proper stack frame for > > a function that makes no calls except for __tls_get_addr and we > > might be here resolving the __tls_get_addr call. */ > > + .hidden _dl_runtime_resolve > > #define INT_PARMS FRAME_MIN_SIZE > > ENTRY (_dl_runtime_resolve, 4) > > stdu r1,-FRAME_SIZE(r1) > > @@ -195,6 +196,7 @@ END(_dl_runtime_resolve) > > parm1 (r3) and the index (r0) needs to be converted to an offset > > (index * 24) in parm2 (r4). */ > > #ifndef PROF > > + .hidden _dl_profile_resolve > > ENTRY (_dl_profile_resolve, 4) > > /* Spill r30, r31 to preserve the link_map* and reloc_addr, in case we > > need to call _dl_audit_pltexit. */ > > LGTM (this should wait until after the freeze), though these two changes > seem orthogonal to the commit title. True. It could have been a separate patch. > For my education, why are are small model accesses used in these files? dl-trampoline.S is an old file, from before the linker edited toc-indirect code to toc-pointer relative. Without the link editing small model is more efficient if the toc is small enough.
diff --git a/sysdeps/powerpc/powerpc64/__longjmp-common.S b/sysdeps/powerpc/powerpc64/__longjmp-common.S index 4d71b9e102..5f629e1e0f 100644 --- a/sysdeps/powerpc/powerpc64/__longjmp-common.S +++ b/sysdeps/powerpc/powerpc64/__longjmp-common.S @@ -47,12 +47,14 @@ ENTRY (__longjmp) CALL_MCOUNT 2 #ifndef __NO_VMX__ - ld r5,.LC__dl_hwcap@toc(r2) + addis r5,r2,.LC__dl_hwcap@toc@ha + ld r5,.LC__dl_hwcap@toc@l(r5) # ifdef SHARED /* Load _rtld-global._dl_hwcap. */ - ld r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r5) + ld r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r5) # else - ld r5,0(r5) /* Load extern _dl_hwcap. */ + /* Load extern _dl_hwcap. */ + ld r5,0(r5) # endif andis. r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16) beq L(no_vmx) diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h index a505998351..eb41c85280 100644 --- a/sysdeps/powerpc/powerpc64/dl-machine.h +++ b/sysdeps/powerpc/powerpc64/dl-machine.h @@ -175,9 +175,12 @@ BODY_PREFIX "_dl_start_user:\n" \ /* the address of _start in r30. */ \ " mr 30,3\n" \ /* &_dl_argc in 29, &_dl_argv in 27, and _dl_loaded in 28. */ \ -" ld 28,.LC__rtld_local@toc(2)\n" \ -" ld 29,.LC__dl_argc@toc(2)\n" \ -" ld 27,.LC__dl_argv@toc(2)\n" \ +" addis 28,2,.LC__rtld_local@toc@ha\n" \ +" ld 28,.LC__rtld_local@toc@l(28)\n" \ +" addis 29,2,.LC__dl_argc@toc@ha\n" \ +" ld 29,.LC__dl_argc@toc@l(29)\n" \ +" addis 27,2,.LC__dl_argv@toc@ha\n" \ +" ld 27,.LC__dl_argv@toc@l(27)\n" \ /* _dl_init (_dl_loaded, _dl_argc, _dl_argv, _dl_argv+_dl_argc+1). */ \ " ld 3,0(28)\n" \ " lwa 4,0(29)\n" \ @@ -204,7 +207,8 @@ BODY_PREFIX "_dl_start_user:\n" \ " addi 6,6,8\n" \ /* Pass a termination function pointer (in this case _dl_fini) in \ r7. */ \ -" ld 7,.LC__dl_fini@toc(2)\n" \ +" addis 7,2,.LC__dl_fini@toc@ha\n" \ +" ld 7,.LC__dl_fini@toc@l(7)\n" \ /* Pass the stack pointer in r1 (so far so good), pointing to a NULL \ value. This lets our startup code distinguish between a program \ linked statically, which linux will call with argc on top of the \ diff --git a/sysdeps/powerpc/powerpc64/dl-trampoline.S b/sysdeps/powerpc/powerpc64/dl-trampoline.S index 23debc2faf..45b821607b 100644 --- a/sysdeps/powerpc/powerpc64/dl-trampoline.S +++ b/sysdeps/powerpc/powerpc64/dl-trampoline.S @@ -32,6 +32,7 @@ because gcc as of 2010/05 doesn't allocate a proper stack frame for a function that makes no calls except for __tls_get_addr and we might be here resolving the __tls_get_addr call. */ + .hidden _dl_runtime_resolve #define INT_PARMS FRAME_MIN_SIZE ENTRY (_dl_runtime_resolve, 4) stdu r1,-FRAME_SIZE(r1) @@ -195,6 +196,7 @@ END(_dl_runtime_resolve) parm1 (r3) and the index (r0) needs to be converted to an offset (index * 24) in parm2 (r4). */ #ifndef PROF + .hidden _dl_profile_resolve ENTRY (_dl_profile_resolve, 4) /* Spill r30, r31 to preserve the link_map* and reloc_addr, in case we need to call _dl_audit_pltexit. */ @@ -225,12 +227,14 @@ ENTRY (_dl_profile_resolve, 4) std r9,INT_PARMS+48(r1) std r10,INT_PARMS+56(r1) std r8,CALLING_SP(r1) - ld r12,.LC__dl_hwcap@toc(r2) + addis r12,r2,.LC__dl_hwcap@toc@ha + ld r12,.LC__dl_hwcap@toc@l(r12) #ifdef SHARED /* Load _rtld_local_ro._dl_hwcap. */ ld r12,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r12) #else - ld r12,0(r12) /* Load extern _dl_hwcap. */ + /* Load extern _dl_hwcap. */ + ld r12,0(r12) #endif andis. r0,r12,(PPC_FEATURE_HAS_ALTIVEC >> 16) beq L(saveFP) diff --git a/sysdeps/powerpc/powerpc64/setjmp-common.S b/sysdeps/powerpc/powerpc64/setjmp-common.S index 41812e3427..19e76d59ee 100644 --- a/sysdeps/powerpc/powerpc64/setjmp-common.S +++ b/sysdeps/powerpc/powerpc64/setjmp-common.S @@ -132,12 +132,14 @@ JUMPTARGET(GLUE(__sigsetjmp_symbol,_ent)): std r31,((JB_GPRS+17)*8)(3) stfd fp31,((JB_FPRS+17)*8)(3) #ifndef __NO_VMX__ - ld r6,.LC__dl_hwcap@toc(r2) + addis r6,r2,.LC__dl_hwcap@toc@ha + ld r6,.LC__dl_hwcap@toc@l(r6) # ifdef SHARED /* Load _rtld-global._dl_hwcap. */ - ld r6,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r6) + ld r6,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET(r6) # else - ld r6,0(r6) /* Load extern _dl_hwcap. */ + /* Load extern _dl_hwcap. */ + ld r6,0(r6) # endif andis. r6,r6,(PPC_FEATURE_HAS_ALTIVEC >> 16) beq L(no_vmx) diff --git a/sysdeps/powerpc/powerpc64/start.S b/sysdeps/powerpc/powerpc64/start.S index 4319dc8d3e..244d9da07b 100644 --- a/sysdeps/powerpc/powerpc64/start.S +++ b/sysdeps/powerpc/powerpc64/start.S @@ -74,7 +74,8 @@ ENTRY (_start) /* put the address of start_addresses in r8... ** ** PPC64 ABI uses R13 for thread local, so we leave it alone */ - ld r8,.L01@toc(r2) + addis r8,r2,.L01@toc@ha + ld r8,.L01@toc@l(r8) /* and continue in libc-start, in glibc. */ b JUMPTARGET(__libc_start_main) diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h index 3fec06e0df..011068b290 100644 --- a/sysdeps/powerpc/powerpc64/sysdep.h +++ b/sysdeps/powerpc/powerpc64/sysdep.h @@ -469,14 +469,16 @@ LT_LABELSUFFIX(name,_name_end): ; \ .tc _rtld_global_ro[TC],_rtld_global_ro # endif # define __GLRO(rOUT, var, offset) \ - ld rOUT,.LC__ ## var@toc(r2); \ + addis rOUT,r2,.LC__ ## var@toc@ha; \ + ld rOUT,.LC__ ## var@toc@l(rOUT); \ lwz rOUT,offset(rOUT) #else # define __GLRO_DEF(var) \ .LC__ ## var: \ .tc _ ## var[TC],_ ## var # define __GLRO(rOUT, var, offset) \ - ld rOUT,.LC__ ## var@toc(r2); \ + addis rOUT,r2,.LC__ ## var@toc@ha; \ + ld rOUT,.LC__ ## var@toc@l(rOUT); \ lwz rOUT,0(rOUT) #endif