Message ID | 20230518170648.93316-1-bugaevc@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v2] Mark more functions as __COLD | expand |
On 18/05/23 14:06, Sergey Bugaev via Libc-alpha wrote: > The various error reporting functions are unlikely to be called; let the > compiler know about this. This will help GCC optimize both these > functions and their callers better. > > In particular, GCC will place the code generated for these functions > into a .text.unlikely section in the object files. The linker will then > group the .text.unlikely sections together inside the .text section of > the resulting binary. This improves code locality. > > In some cases the compiler may even decide that it's beneficial to > separate out the code paths in other functions that lead to a call of a > cold function, and also place them into .text.unlikely section. This > works both within glibc, and in any external code as long as it's > compiled against the new headers containing the __COLD annotations, with > a compiler that can understand and make use of them. > > Signed-off-by: Sergey Bugaev <bugaevc@gmail.com> The rationale seems ok, some comments below. > --- > > Compared to v1: > * got rid of the unintentional _Noreturn vs __attribute__ ((noreturn)) > changes; > * verified that this commit no longer shows up in 'git log -S' output > for either _Noreturn or noreturn. > > assert/assert.h | 4 ++-- > debug/chk_fail.c | 2 +- > elf/dl-exception.c | 2 +- > elf/dl-minimal.c | 8 ++++---- > elf/dl-open.c | 2 +- > elf/dl-tls.c | 2 +- > include/assert.h | 6 +++--- > include/stdio.h | 8 +++++--- > include/sys/cdefs.h | 3 +++ > malloc/dynarray.h | 2 +- > malloc/malloc.c | 3 ++- > misc/bits/error.h | 12 ++++++------ > stdlib/stdlib.h | 2 +- > sysdeps/generic/dl-call_tls_init_tp.h | 2 +- > sysdeps/generic/ldsodefs.h | 23 ++++++++++++----------- > sysdeps/mach/hurd/dl-sysdep.c | 2 +- > 16 files changed, 45 insertions(+), 38 deletions(-) > > diff --git a/assert/assert.h b/assert/assert.h > index 62670e4b..1c472e16 100644 > --- a/assert/assert.h > +++ b/assert/assert.h > @@ -66,12 +66,12 @@ __BEGIN_DECLS > /* This prints an "Assertion failed" message and aborts. */ > extern void __assert_fail (const char *__assertion, const char *__file, > unsigned int __line, const char *__function) > - __THROW __attribute__ ((__noreturn__)); > + __THROW __attribute__ ((__noreturn__)) __COLD; > > /* Likewise, but prints the error text for ERRNUM. */ > extern void __assert_perror_fail (int __errnum, const char *__file, > unsigned int __line, const char *__function) > - __THROW __attribute__ ((__noreturn__)); > + __THROW __attribute__ ((__noreturn__)) __COLD; > > > /* The following is not at all used here but needed for standard > diff --git a/debug/chk_fail.c b/debug/chk_fail.c > index 299e95b6..97037972 100644 > --- a/debug/chk_fail.c > +++ b/debug/chk_fail.c > @@ -22,7 +22,7 @@ > extern char **__libc_argv attribute_hidden; > > void > -__attribute__ ((noreturn)) > +__attribute__ ((noreturn)) __COLD > __chk_fail (void) > { > __fortify_fail ("buffer overflow detected"); > diff --git a/elf/dl-exception.c b/elf/dl-exception.c > index 06a27cd7..1e1309fb 100644 > --- a/elf/dl-exception.c > +++ b/elf/dl-exception.c > @@ -52,7 +52,7 @@ oom_exception (struct dl_exception *exception) > } > > static void > -__attribute__ ((noreturn)) > +__attribute__ ((noreturn)) __COLD > length_mismatch (void) > { > _dl_fatal_printf ("Fatal error: " > diff --git a/elf/dl-minimal.c b/elf/dl-minimal.c > index abda453c..dc267519 100644 > --- a/elf/dl-minimal.c > +++ b/elf/dl-minimal.c > @@ -156,14 +156,14 @@ __strerror_r (int errnum, char *buf, size_t buflen) > return msg; > } > > -void > +void __COLD > __libc_fatal (const char *message) > { > _dl_fatal_printf ("%s", message); > } > rtld_hidden_def (__libc_fatal) > Can't you just add on the function prototype at include/stdio.h? Same question for the __assert_fail and __assert_perror_fail below. > -void > +void __COLD > __attribute__ ((noreturn)) > __chk_fail (void) > { > @@ -176,7 +176,7 @@ rtld_hidden_def (__chk_fail) > If we are linked into the user program (-ldl), the normal __assert_fail > defn can override this one. */ > > -void weak_function > +void weak_function __COLD > __assert_fail (const char *assertion, > const char *file, unsigned int line, const char *function) > { > @@ -190,7 +190,7 @@ Inconsistency detected by ld.so: %s: %u: %s%sAssertion `%s' failed!\n", > rtld_hidden_weak (__assert_fail) > # endif > > -void weak_function > +void weak_function __COLD > __assert_perror_fail (int errnum, > const char *file, unsigned int line, > const char *function) > diff --git a/elf/dl-open.c b/elf/dl-open.c > index 2d985e21..47bc5cac 100644 > --- a/elf/dl-open.c > +++ b/elf/dl-open.c > @@ -77,7 +77,7 @@ struct dl_open_args > }; > > /* Called in case the global scope cannot be extended. */ > -static void __attribute__ ((noreturn)) > +static void __attribute__ ((noreturn)) __COLD > add_to_global_resize_failure (struct link_map *new) > { > _dl_signal_error (ENOMEM, new->l_libname->name, NULL, > diff --git a/elf/dl-tls.c b/elf/dl-tls.c > index 4ef7bc3f..5e198499 100644 > --- a/elf/dl-tls.c > +++ b/elf/dl-tls.c > @@ -113,7 +113,7 @@ _dl_tls_static_surplus_init (size_t naudit) > > /* Out-of-memory handler. */ > static void > -__attribute__ ((__noreturn__)) > +__attribute__ ((__noreturn__)) __COLD > oom (void) > { > _dl_fatal_printf ("cannot allocate memory for thread-local data: ABORT\n"); > diff --git a/include/assert.h b/include/assert.h > index c812808f..9f0a7f8a 100644 > --- a/include/assert.h > +++ b/include/assert.h > @@ -6,19 +6,19 @@ > so it has to be repeated here. */ > extern void __assert_fail (const char *__assertion, const char *__file, > unsigned int __line, const char *__function) > - __THROW __attribute__ ((__noreturn__)); > + __THROW __attribute__ ((__noreturn__)) __COLD; > > /* Likewise, but prints the error text for ERRNUM. */ > extern void __assert_perror_fail (int __errnum, const char *__file, > unsigned int __line, > const char *__function) > - __THROW __attribute__ ((__noreturn__)); > + __THROW __attribute__ ((__noreturn__)) __COLD; > > /* The real implementation of the two functions above. */ > extern void __assert_fail_base (const char *fmt, const char *assertion, > const char *file, unsigned int line, > const char *function) > - __THROW __attribute__ ((__noreturn__)) attribute_hidden; > + __THROW __attribute__ ((__noreturn__)) attribute_hidden __COLD; > > rtld_hidden_proto (__assert_fail) > rtld_hidden_proto (__assert_perror_fail) > diff --git a/include/stdio.h b/include/stdio.h > index da47d1ce..7f4c33e2 100644 > --- a/include/stdio.h > +++ b/include/stdio.h > @@ -171,9 +171,11 @@ extern int __gen_tempname (char *__tmpl, int __suffixlen, int __flags, > /* Print out MESSAGE (which should end with a newline) on the error output > and abort. */ > extern void __libc_fatal (const char *__message) > - __attribute__ ((__noreturn__)); > -_Noreturn void __libc_message (const char *__fnt, ...) attribute_hidden; > -extern void __fortify_fail (const char *msg) __attribute__ ((__noreturn__)); > + __attribute__ ((__noreturn__)) __COLD; > +_Noreturn void __libc_message (const char *__fnt, ...) > + attribute_hidden __COLD; > +extern void __fortify_fail (const char *msg) > + __attribute__ ((__noreturn__)) __COLD; > libc_hidden_proto (__fortify_fail) > > /* Acquire ownership of STREAM. */ > diff --git a/include/sys/cdefs.h b/include/sys/cdefs.h > index 56adb231..ef78edda 100644 > --- a/include/sys/cdefs.h > +++ b/include/sys/cdefs.h > @@ -16,6 +16,9 @@ > # undef __nonnull > # define __nonnull(params) > > +/* Intentionally not marked __COLD in the header, since this only causes GCC > + to create a bunch of useless __foo_chk.cold symbols containing only a call > + to this function; better just keep calling it directly. */ > extern void __chk_fail (void) __attribute__ ((__noreturn__)); > libc_hidden_proto (__chk_fail) > rtld_hidden_proto (__chk_fail) Why exactly gcc generates the useless __foo_chk.cold for this case? Is this a bug or a limitation? > diff --git a/malloc/dynarray.h b/malloc/dynarray.h > index a2d1fd26..5093cc33 100644 > --- a/malloc/dynarray.h > +++ b/malloc/dynarray.h > @@ -165,7 +165,7 @@ bool __libc_dynarray_finalize (struct dynarray_header *list, void *scratch, > /* Internal function. Terminate the process after an index error. > SIZE is the number of elements of the dynamic array. INDEX is the > lookup index which triggered the failure. */ > -_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index); > +_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index) __COLD; > > #ifndef _ISOMAC > libc_hidden_proto (__libc_dynarray_emplace_enlarge) > diff --git a/malloc/malloc.c b/malloc/malloc.c > index 5d8b61d6..6f66813a 100644 > --- a/malloc/malloc.c > +++ b/malloc/malloc.c > @@ -1093,7 +1093,8 @@ static void* _int_memalign(mstate, size_t, size_t); > static void* _mid_memalign(size_t, size_t, void *); > #endif > > -static void malloc_printerr(const char *str) __attribute__ ((noreturn)); > +static void malloc_printerr(const char *str) > + __attribute__ ((noreturn)) __COLD; > > static void munmap_chunk(mchunkptr p); > #if HAVE_MREMAP > diff --git a/misc/bits/error.h b/misc/bits/error.h > index b3fd5020..46ec0559 100644 > --- a/misc/bits/error.h > +++ b/misc/bits/error.h > @@ -24,16 +24,16 @@ > extern void __REDIRECT (__error_alias, (int __status, int __errnum, > const char *__format, ...), > error) > - __attribute__ ((__format__ (__printf__, 3, 4))); > + __attribute__ ((__format__ (__printf__, 3, 4))) __COLD; > extern void __REDIRECT (__error_noreturn, (int __status, int __errnum, > const char *__format, ...), > error) > - __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4))); > + __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4))) __COLD; > > > /* If we know the function will never return make sure the compiler > realizes that, too. */ > -__extern_always_inline void > +__extern_always_inline __COLD void > error (int __status, int __errnum, const char *__format, ...) > { > if (__builtin_constant_p (__status) && __status != 0) > @@ -48,19 +48,19 @@ extern void __REDIRECT (__error_at_line_alias, (int __status, int __errnum, > unsigned int __line, > const char *__format, ...), > error_at_line) > - __attribute__ ((__format__ (__printf__, 5, 6))); > + __attribute__ ((__format__ (__printf__, 5, 6))) __COLD; > extern void __REDIRECT (__error_at_line_noreturn, (int __status, int __errnum, > const char *__fname, > unsigned int __line, > const char *__format, > ...), > error_at_line) > - __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6))); > + __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6))) __COLD; > > > /* If we know the function will never return make sure the compiler > realizes that, too. */ > -__extern_always_inline void > +__extern_always_inline __COLD void > error_at_line (int __status, int __errnum, const char *__fname, > unsigned int __line, const char *__format, ...) > { > diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h > index 631b0cbb..84afc059 100644 > --- a/stdlib/stdlib.h > +++ b/stdlib/stdlib.h > @@ -727,7 +727,7 @@ extern void *aligned_alloc (size_t __alignment, size_t __size) > #endif > > /* Abort execution and generate a core-dump. */ > -extern void abort (void) __THROW __attribute__ ((__noreturn__)); > +extern void abort (void) __THROW __attribute__ ((__noreturn__)) __COLD; > > > /* Register a function to be called when `exit' is called. */ > diff --git a/sysdeps/generic/dl-call_tls_init_tp.h b/sysdeps/generic/dl-call_tls_init_tp.h > index 208f91e2..34f0959f 100644 > --- a/sysdeps/generic/dl-call_tls_init_tp.h > +++ b/sysdeps/generic/dl-call_tls_init_tp.h > @@ -19,7 +19,7 @@ > #include <startup.h> > #include <tls.h> > > -static inline void > +static inline void __COLD > _startup_fatal_tls_error (void) > { > _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n"); > diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h > index ba531762..a4a0d307 100644 > --- a/sysdeps/generic/ldsodefs.h > +++ b/sysdeps/generic/ldsodefs.h > @@ -785,12 +785,12 @@ void _dl_printf (const char *fmt, ...) > /* Write a message on the specified descriptor standard error. The > parameters are interpreted as for a `printf' call. */ > void _dl_error_printf (const char *fmt, ...) > - attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2))); > + attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2))) __COLD; > > /* Write a message on the specified descriptor standard error and exit > the program. The parameters are interpreted as for a `printf' call. */ > void _dl_fatal_printf (const char *fmt, ...) > - __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__)); > + __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__)) __COLD; > rtld_hidden_proto (_dl_fatal_printf) > > /* An exception raised by the _dl_signal_error function family and > @@ -813,25 +813,25 @@ struct dl_exception > string describing the specific problem. */ > void _dl_exception_create (struct dl_exception *, const char *object, > const char *errstring) > - __attribute__ ((nonnull (1, 3))); > + __attribute__ ((nonnull (1, 3))) __COLD; > rtld_hidden_proto (_dl_exception_create) > > /* Used internally to implement dlerror message freeing. See > include/dlfcn.h and dlfcn/dlerror.c. */ > -void _dl_error_free (void *ptr) attribute_hidden; > +void _dl_error_free (void *ptr) attribute_hidden __COLD; > > /* Like _dl_exception_create, but create errstring from a format > string FMT. Currently, only "%s" and "%%" are supported as format > directives. */ > void _dl_exception_create_format (struct dl_exception *, const char *objname, > const char *fmt, ...) > - __attribute__ ((nonnull (1, 3), format (printf, 3, 4))); > + __attribute__ ((nonnull (1, 3), format (printf, 3, 4))) __COLD; > rtld_hidden_proto (_dl_exception_create_format) > > /* Deallocate the exception, freeing allocated buffers (if > possible). */ > void _dl_exception_free (struct dl_exception *) > - __attribute__ ((nonnull (1))); > + __attribute__ ((nonnull (1))) __COLD; > rtld_hidden_proto (_dl_exception_free) > > /* This function is called by all the internal dynamic linker > @@ -841,13 +841,13 @@ rtld_hidden_proto (_dl_exception_free) > process is terminated immediately. */ > void _dl_signal_exception (int errcode, struct dl_exception *, > const char *occasion) > - __attribute__ ((__noreturn__)); > + __attribute__ ((__noreturn__)) __COLD; > rtld_hidden_proto (_dl_signal_exception) > > /* Like _dl_signal_exception, but creates the exception first. */ > extern void _dl_signal_error (int errcode, const char *object, > const char *occasion, const char *errstring) > - __attribute__ ((__noreturn__)); > + __attribute__ ((__noreturn__)) __COLD; > rtld_hidden_proto (_dl_signal_error) > > /* Like _dl_signal_exception, but may return when called in the > @@ -856,7 +856,8 @@ rtld_hidden_proto (_dl_signal_error) > _dl_signal_exception. */ > #if IS_IN (rtld) > extern void _dl_signal_cexception (int errcode, struct dl_exception *, > - const char *occasion) attribute_hidden; > + const char *occasion) > + attribute_hidden __COLD; > #else > __attribute__ ((always_inline)) > static inline void > @@ -871,7 +872,7 @@ _dl_signal_cexception (int errcode, struct dl_exception *exception, > #if IS_IN (rtld) > extern void _dl_signal_cerror (int errcode, const char *object, > const char *occasion, const char *errstring) > - attribute_hidden; > + attribute_hidden __COLD; > #else > __attribute__ ((always_inline)) > static inline void > @@ -1020,7 +1021,7 @@ extern void _dl_protect_relro (struct link_map *map) attribute_hidden; > PLT is nonzero if this was a PLT reloc; it just affects the message. */ > extern void _dl_reloc_bad_type (struct link_map *map, > unsigned int type, int plt) > - attribute_hidden __attribute__ ((__noreturn__)); > + attribute_hidden __attribute__ ((__noreturn__)) __COLD; > > /* Check the version dependencies of all objects available through > MAP. If VERBOSE print some more diagnostics. */ > diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c > index 79ebb0ce..01a16570 100644 > --- a/sysdeps/mach/hurd/dl-sysdep.c > +++ b/sysdeps/mach/hurd/dl-sysdep.c > @@ -753,7 +753,7 @@ strong_alias (_exit, __GI__exit) > #endif > > check_no_hidden(abort); > -void weak_function > +void weak_function __COLD > abort (void) > { > /* Try to abort using the system specific command. */
On Thu, May 18, 2023 at 10:43 PM Adhemerval Zanella Netto <adhemerval.zanella@linaro.org> wrote: > The rationale seems ok, some comments below. Thanks. Any thoughts on the .text.{startup,exit} part? > > -void > > +void __COLD > > __libc_fatal (const char *message) > > { > > _dl_fatal_printf ("%s", message); > > } > > rtld_hidden_def (__libc_fatal) > > > > Can't you just add on the function prototype at include/stdio.h? Same > question for the __assert_fail and __assert_perror_fail below. But I did just that (added __COLD to the prototypes in include/stdio.h and include/assert.h), didn't I? If you're saying that it's not worth repeating __COLD on the definition, then sure, I could remove that if you prefer. > > +/* Intentionally not marked __COLD in the header, since this only causes GCC > > + to create a bunch of useless __foo_chk.cold symbols containing only a call > > + to this function; better just keep calling it directly. */ > > extern void __chk_fail (void) __attribute__ ((__noreturn__)); > > libc_hidden_proto (__chk_fail) > > rtld_hidden_proto (__chk_fail) > > Why exactly gcc generates the useless __foo_chk.cold for this case? Is this a > bug or a limitation? I don't know; your guess is as good as mine (actually yours would be better than mine). But my guess would be that they just didn't think to add a check that whatever code size savings they're getting by moving the cold path into a separate section outweigh the jump instruction to get there. Here's what I'm getting specifically, on i686-gnu: Dump of assembler code for function __ppoll_chk: Address range 0x198760 to 0x19879e: 0x00198760 <+0>: 56 push %esi 0x00198761 <+1>: 53 push %ebx 0x00198762 <+2>: 83 ec 04 sub $0x4,%esp 0x00198765 <+5>: 8b 44 24 20 mov 0x20(%esp),%eax 0x00198769 <+9>: 8b 54 24 14 mov 0x14(%esp),%edx 0x0019876d <+13>: 8b 4c 24 10 mov 0x10(%esp),%ecx 0x00198771 <+17>: 8b 5c 24 18 mov 0x18(%esp),%ebx 0x00198775 <+21>: c1 e8 03 shr $0x3,%eax 0x00198778 <+24>: 8b 74 24 1c mov 0x1c(%esp),%esi 0x0019877c <+28>: 39 d0 cmp %edx,%eax 0x0019877e <+30>: 0f 82 9d bb e8 ff jb 0x24321 <__ppoll_chk.cold> 0x00198784 <+36>: 89 74 24 1c mov %esi,0x1c(%esp) 0x00198788 <+40>: 89 5c 24 18 mov %ebx,0x18(%esp) 0x0019878c <+44>: 89 54 24 14 mov %edx,0x14(%esp) 0x00198790 <+48>: 89 4c 24 10 mov %ecx,0x10(%esp) 0x00198794 <+52>: 83 c4 04 add $0x4,%esp 0x00198797 <+55>: 5b pop %ebx 0x00198798 <+56>: 5e pop %esi 0x00198799 <+57>: e9 b2 b9 fb ff jmp 0x154150 <__GI_ppoll> Address range 0x24321 to 0x24326: 0x00024321 <-1524799>: e8 5c ff ff ff call 0x24282 <__GI___chk_fail> End of assembler dump. It's spending 6 bytes for the 'jb __ppoll_chk.cold', only to jump to 'call __GI___chk_fail' which takes 5 bytes. That's negative space savings, both overall and inside .text. And actually frankly that's bad codegen altogether, unless I'm missing something. Why not mov 20(%esp), %eax shr $3, %eax cmp 8(%esp), %eax jnb __GI_ppoll push %ebp mov %esp, %ebp call __GI___chk_fail Then maybe it'd make sense to move the "push, mov, call" into .text.unlikely, adding a jmp. Sergey
On 19/05/23 07:35, Sergey Bugaev wrote: > On Thu, May 18, 2023 at 10:43 PM Adhemerval Zanella Netto > <adhemerval.zanella@linaro.org> wrote: >> The rationale seems ok, some comments below. > > Thanks. Any thoughts on the .text.{startup,exit} part? > >>> -void >>> +void __COLD >>> __libc_fatal (const char *message) >>> { >>> _dl_fatal_printf ("%s", message); >>> } >>> rtld_hidden_def (__libc_fatal) >>> >> >> Can't you just add on the function prototype at include/stdio.h? Same >> question for the __assert_fail and __assert_perror_fail below. > > But I did just that (added __COLD to the prototypes in include/stdio.h > and include/assert.h), didn't I? > > If you're saying that it's not worth repeating __COLD on the > definition, then sure, I could remove that if you prefer. The later, specially because for __chk_fail you do add an specific comment. > >>> +/* Intentionally not marked __COLD in the header, since this only causes GCC >>> + to create a bunch of useless __foo_chk.cold symbols containing only a call >>> + to this function; better just keep calling it directly. */ >>> extern void __chk_fail (void) __attribute__ ((__noreturn__)); >>> libc_hidden_proto (__chk_fail) >>> rtld_hidden_proto (__chk_fail) >> >> Why exactly gcc generates the useless __foo_chk.cold for this case? Is this a >> bug or a limitation? > > I don't know; your guess is as good as mine (actually yours would be > better than mine). But my guess would be that they just didn't think > to add a check that whatever code size savings they're getting by > moving the cold path into a separate section outweigh the jump > instruction to get there. > > Here's what I'm getting specifically, on i686-gnu: > > Dump of assembler code for function __ppoll_chk: > Address range 0x198760 to 0x19879e: > 0x00198760 <+0>: 56 push %esi > 0x00198761 <+1>: 53 push %ebx > 0x00198762 <+2>: 83 ec 04 sub $0x4,%esp > 0x00198765 <+5>: 8b 44 24 20 mov 0x20(%esp),%eax > 0x00198769 <+9>: 8b 54 24 14 mov 0x14(%esp),%edx > 0x0019876d <+13>: 8b 4c 24 10 mov 0x10(%esp),%ecx > 0x00198771 <+17>: 8b 5c 24 18 mov 0x18(%esp),%ebx > 0x00198775 <+21>: c1 e8 03 shr $0x3,%eax > 0x00198778 <+24>: 8b 74 24 1c mov 0x1c(%esp),%esi > 0x0019877c <+28>: 39 d0 cmp %edx,%eax > 0x0019877e <+30>: 0f 82 9d bb e8 ff jb 0x24321 <__ppoll_chk.cold> > 0x00198784 <+36>: 89 74 24 1c mov %esi,0x1c(%esp) > 0x00198788 <+40>: 89 5c 24 18 mov %ebx,0x18(%esp) > 0x0019878c <+44>: 89 54 24 14 mov %edx,0x14(%esp) > 0x00198790 <+48>: 89 4c 24 10 mov %ecx,0x10(%esp) > 0x00198794 <+52>: 83 c4 04 add $0x4,%esp > 0x00198797 <+55>: 5b pop %ebx > 0x00198798 <+56>: 5e pop %esi > 0x00198799 <+57>: e9 b2 b9 fb ff jmp 0x154150 <__GI_ppoll> > Address range 0x24321 to 0x24326: > 0x00024321 <-1524799>: e8 5c ff ff ff call 0x24282 <__GI___chk_fail> > End of assembler dump. > > It's spending 6 bytes for the 'jb __ppoll_chk.cold', only to jump to > 'call __GI___chk_fail' which takes 5 bytes. That's negative space > savings, both overall and inside .text. My guess this is arch-specific, since for aarch64-linux I am not seeing any '.cold' sections being generated: 00000000000f4950 <__ppoll_chk>: f4950: eb440c3f cmp x1, x4, lsr #3 f4954: 54000048 b.hi f495c <__ppoll_chk+0xc> // b.pmore f4958: 17ffa1a6 b dcff0 <ppoll> f495c: a9bf7bfd stp x29, x30, [sp, #-16]! f4960: 910003fd mov x29, sp f4964: 97fcdae1 bl 2b4e8 <__chk_fail> f4968: d503201f nop f496c: d503201f nop So I don't have a strong opinion about it. It does seems to generate better code for x86, although not as much for aarch64: With this patch: text data bss dec hex filename 1867381 411832 55080 2334293 239e55 x86_64-linux-gnu-patch/libc.so 2147360 129084 39524 2315968 2356c0 i686-linux-gnu-patch/libc.so 1574355 410624 51704 2036683 1f13cb aarch64-linux-gnu-patch/libc.so With this patch with __COLD for __chk_fail prototype: text data bss dec hex filename 1868824 411832 55080 2335736 23a3f8 x86_64-linux-gnu/libc.so 2149056 129084 39524 2317664 235d60 i686-linux-gnu/libc.so 1574256 410624 51704 2036584 1f1368 aarch64-linux-gnu/libc.so > > And actually frankly that's bad codegen altogether, unless I'm missing > something. Why not > > mov 20(%esp), %eax > shr $3, %eax > cmp 8(%esp), %eax > jnb __GI_ppoll > push %ebp > mov %esp, %ebp > call __GI___chk_fail > > Then maybe it'd make sense to move the "push, mov, call" into > .text.unlikely, adding a jmp. It might be worth to open a bug report on GCC.
diff --git a/assert/assert.h b/assert/assert.h index 62670e4b..1c472e16 100644 --- a/assert/assert.h +++ b/assert/assert.h @@ -66,12 +66,12 @@ __BEGIN_DECLS /* This prints an "Assertion failed" message and aborts. */ extern void __assert_fail (const char *__assertion, const char *__file, unsigned int __line, const char *__function) - __THROW __attribute__ ((__noreturn__)); + __THROW __attribute__ ((__noreturn__)) __COLD; /* Likewise, but prints the error text for ERRNUM. */ extern void __assert_perror_fail (int __errnum, const char *__file, unsigned int __line, const char *__function) - __THROW __attribute__ ((__noreturn__)); + __THROW __attribute__ ((__noreturn__)) __COLD; /* The following is not at all used here but needed for standard diff --git a/debug/chk_fail.c b/debug/chk_fail.c index 299e95b6..97037972 100644 --- a/debug/chk_fail.c +++ b/debug/chk_fail.c @@ -22,7 +22,7 @@ extern char **__libc_argv attribute_hidden; void -__attribute__ ((noreturn)) +__attribute__ ((noreturn)) __COLD __chk_fail (void) { __fortify_fail ("buffer overflow detected"); diff --git a/elf/dl-exception.c b/elf/dl-exception.c index 06a27cd7..1e1309fb 100644 --- a/elf/dl-exception.c +++ b/elf/dl-exception.c @@ -52,7 +52,7 @@ oom_exception (struct dl_exception *exception) } static void -__attribute__ ((noreturn)) +__attribute__ ((noreturn)) __COLD length_mismatch (void) { _dl_fatal_printf ("Fatal error: " diff --git a/elf/dl-minimal.c b/elf/dl-minimal.c index abda453c..dc267519 100644 --- a/elf/dl-minimal.c +++ b/elf/dl-minimal.c @@ -156,14 +156,14 @@ __strerror_r (int errnum, char *buf, size_t buflen) return msg; } -void +void __COLD __libc_fatal (const char *message) { _dl_fatal_printf ("%s", message); } rtld_hidden_def (__libc_fatal) -void +void __COLD __attribute__ ((noreturn)) __chk_fail (void) { @@ -176,7 +176,7 @@ rtld_hidden_def (__chk_fail) If we are linked into the user program (-ldl), the normal __assert_fail defn can override this one. */ -void weak_function +void weak_function __COLD __assert_fail (const char *assertion, const char *file, unsigned int line, const char *function) { @@ -190,7 +190,7 @@ Inconsistency detected by ld.so: %s: %u: %s%sAssertion `%s' failed!\n", rtld_hidden_weak (__assert_fail) # endif -void weak_function +void weak_function __COLD __assert_perror_fail (int errnum, const char *file, unsigned int line, const char *function) diff --git a/elf/dl-open.c b/elf/dl-open.c index 2d985e21..47bc5cac 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -77,7 +77,7 @@ struct dl_open_args }; /* Called in case the global scope cannot be extended. */ -static void __attribute__ ((noreturn)) +static void __attribute__ ((noreturn)) __COLD add_to_global_resize_failure (struct link_map *new) { _dl_signal_error (ENOMEM, new->l_libname->name, NULL, diff --git a/elf/dl-tls.c b/elf/dl-tls.c index 4ef7bc3f..5e198499 100644 --- a/elf/dl-tls.c +++ b/elf/dl-tls.c @@ -113,7 +113,7 @@ _dl_tls_static_surplus_init (size_t naudit) /* Out-of-memory handler. */ static void -__attribute__ ((__noreturn__)) +__attribute__ ((__noreturn__)) __COLD oom (void) { _dl_fatal_printf ("cannot allocate memory for thread-local data: ABORT\n"); diff --git a/include/assert.h b/include/assert.h index c812808f..9f0a7f8a 100644 --- a/include/assert.h +++ b/include/assert.h @@ -6,19 +6,19 @@ so it has to be repeated here. */ extern void __assert_fail (const char *__assertion, const char *__file, unsigned int __line, const char *__function) - __THROW __attribute__ ((__noreturn__)); + __THROW __attribute__ ((__noreturn__)) __COLD; /* Likewise, but prints the error text for ERRNUM. */ extern void __assert_perror_fail (int __errnum, const char *__file, unsigned int __line, const char *__function) - __THROW __attribute__ ((__noreturn__)); + __THROW __attribute__ ((__noreturn__)) __COLD; /* The real implementation of the two functions above. */ extern void __assert_fail_base (const char *fmt, const char *assertion, const char *file, unsigned int line, const char *function) - __THROW __attribute__ ((__noreturn__)) attribute_hidden; + __THROW __attribute__ ((__noreturn__)) attribute_hidden __COLD; rtld_hidden_proto (__assert_fail) rtld_hidden_proto (__assert_perror_fail) diff --git a/include/stdio.h b/include/stdio.h index da47d1ce..7f4c33e2 100644 --- a/include/stdio.h +++ b/include/stdio.h @@ -171,9 +171,11 @@ extern int __gen_tempname (char *__tmpl, int __suffixlen, int __flags, /* Print out MESSAGE (which should end with a newline) on the error output and abort. */ extern void __libc_fatal (const char *__message) - __attribute__ ((__noreturn__)); -_Noreturn void __libc_message (const char *__fnt, ...) attribute_hidden; -extern void __fortify_fail (const char *msg) __attribute__ ((__noreturn__)); + __attribute__ ((__noreturn__)) __COLD; +_Noreturn void __libc_message (const char *__fnt, ...) + attribute_hidden __COLD; +extern void __fortify_fail (const char *msg) + __attribute__ ((__noreturn__)) __COLD; libc_hidden_proto (__fortify_fail) /* Acquire ownership of STREAM. */ diff --git a/include/sys/cdefs.h b/include/sys/cdefs.h index 56adb231..ef78edda 100644 --- a/include/sys/cdefs.h +++ b/include/sys/cdefs.h @@ -16,6 +16,9 @@ # undef __nonnull # define __nonnull(params) +/* Intentionally not marked __COLD in the header, since this only causes GCC + to create a bunch of useless __foo_chk.cold symbols containing only a call + to this function; better just keep calling it directly. */ extern void __chk_fail (void) __attribute__ ((__noreturn__)); libc_hidden_proto (__chk_fail) rtld_hidden_proto (__chk_fail) diff --git a/malloc/dynarray.h b/malloc/dynarray.h index a2d1fd26..5093cc33 100644 --- a/malloc/dynarray.h +++ b/malloc/dynarray.h @@ -165,7 +165,7 @@ bool __libc_dynarray_finalize (struct dynarray_header *list, void *scratch, /* Internal function. Terminate the process after an index error. SIZE is the number of elements of the dynamic array. INDEX is the lookup index which triggered the failure. */ -_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index); +_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index) __COLD; #ifndef _ISOMAC libc_hidden_proto (__libc_dynarray_emplace_enlarge) diff --git a/malloc/malloc.c b/malloc/malloc.c index 5d8b61d6..6f66813a 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -1093,7 +1093,8 @@ static void* _int_memalign(mstate, size_t, size_t); static void* _mid_memalign(size_t, size_t, void *); #endif -static void malloc_printerr(const char *str) __attribute__ ((noreturn)); +static void malloc_printerr(const char *str) + __attribute__ ((noreturn)) __COLD; static void munmap_chunk(mchunkptr p); #if HAVE_MREMAP diff --git a/misc/bits/error.h b/misc/bits/error.h index b3fd5020..46ec0559 100644 --- a/misc/bits/error.h +++ b/misc/bits/error.h @@ -24,16 +24,16 @@ extern void __REDIRECT (__error_alias, (int __status, int __errnum, const char *__format, ...), error) - __attribute__ ((__format__ (__printf__, 3, 4))); + __attribute__ ((__format__ (__printf__, 3, 4))) __COLD; extern void __REDIRECT (__error_noreturn, (int __status, int __errnum, const char *__format, ...), error) - __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4))); + __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4))) __COLD; /* If we know the function will never return make sure the compiler realizes that, too. */ -__extern_always_inline void +__extern_always_inline __COLD void error (int __status, int __errnum, const char *__format, ...) { if (__builtin_constant_p (__status) && __status != 0) @@ -48,19 +48,19 @@ extern void __REDIRECT (__error_at_line_alias, (int __status, int __errnum, unsigned int __line, const char *__format, ...), error_at_line) - __attribute__ ((__format__ (__printf__, 5, 6))); + __attribute__ ((__format__ (__printf__, 5, 6))) __COLD; extern void __REDIRECT (__error_at_line_noreturn, (int __status, int __errnum, const char *__fname, unsigned int __line, const char *__format, ...), error_at_line) - __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6))); + __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6))) __COLD; /* If we know the function will never return make sure the compiler realizes that, too. */ -__extern_always_inline void +__extern_always_inline __COLD void error_at_line (int __status, int __errnum, const char *__fname, unsigned int __line, const char *__format, ...) { diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h index 631b0cbb..84afc059 100644 --- a/stdlib/stdlib.h +++ b/stdlib/stdlib.h @@ -727,7 +727,7 @@ extern void *aligned_alloc (size_t __alignment, size_t __size) #endif /* Abort execution and generate a core-dump. */ -extern void abort (void) __THROW __attribute__ ((__noreturn__)); +extern void abort (void) __THROW __attribute__ ((__noreturn__)) __COLD; /* Register a function to be called when `exit' is called. */ diff --git a/sysdeps/generic/dl-call_tls_init_tp.h b/sysdeps/generic/dl-call_tls_init_tp.h index 208f91e2..34f0959f 100644 --- a/sysdeps/generic/dl-call_tls_init_tp.h +++ b/sysdeps/generic/dl-call_tls_init_tp.h @@ -19,7 +19,7 @@ #include <startup.h> #include <tls.h> -static inline void +static inline void __COLD _startup_fatal_tls_error (void) { _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n"); diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index ba531762..a4a0d307 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -785,12 +785,12 @@ void _dl_printf (const char *fmt, ...) /* Write a message on the specified descriptor standard error. The parameters are interpreted as for a `printf' call. */ void _dl_error_printf (const char *fmt, ...) - attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2))); + attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2))) __COLD; /* Write a message on the specified descriptor standard error and exit the program. The parameters are interpreted as for a `printf' call. */ void _dl_fatal_printf (const char *fmt, ...) - __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__)); + __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__)) __COLD; rtld_hidden_proto (_dl_fatal_printf) /* An exception raised by the _dl_signal_error function family and @@ -813,25 +813,25 @@ struct dl_exception string describing the specific problem. */ void _dl_exception_create (struct dl_exception *, const char *object, const char *errstring) - __attribute__ ((nonnull (1, 3))); + __attribute__ ((nonnull (1, 3))) __COLD; rtld_hidden_proto (_dl_exception_create) /* Used internally to implement dlerror message freeing. See include/dlfcn.h and dlfcn/dlerror.c. */ -void _dl_error_free (void *ptr) attribute_hidden; +void _dl_error_free (void *ptr) attribute_hidden __COLD; /* Like _dl_exception_create, but create errstring from a format string FMT. Currently, only "%s" and "%%" are supported as format directives. */ void _dl_exception_create_format (struct dl_exception *, const char *objname, const char *fmt, ...) - __attribute__ ((nonnull (1, 3), format (printf, 3, 4))); + __attribute__ ((nonnull (1, 3), format (printf, 3, 4))) __COLD; rtld_hidden_proto (_dl_exception_create_format) /* Deallocate the exception, freeing allocated buffers (if possible). */ void _dl_exception_free (struct dl_exception *) - __attribute__ ((nonnull (1))); + __attribute__ ((nonnull (1))) __COLD; rtld_hidden_proto (_dl_exception_free) /* This function is called by all the internal dynamic linker @@ -841,13 +841,13 @@ rtld_hidden_proto (_dl_exception_free) process is terminated immediately. */ void _dl_signal_exception (int errcode, struct dl_exception *, const char *occasion) - __attribute__ ((__noreturn__)); + __attribute__ ((__noreturn__)) __COLD; rtld_hidden_proto (_dl_signal_exception) /* Like _dl_signal_exception, but creates the exception first. */ extern void _dl_signal_error (int errcode, const char *object, const char *occasion, const char *errstring) - __attribute__ ((__noreturn__)); + __attribute__ ((__noreturn__)) __COLD; rtld_hidden_proto (_dl_signal_error) /* Like _dl_signal_exception, but may return when called in the @@ -856,7 +856,8 @@ rtld_hidden_proto (_dl_signal_error) _dl_signal_exception. */ #if IS_IN (rtld) extern void _dl_signal_cexception (int errcode, struct dl_exception *, - const char *occasion) attribute_hidden; + const char *occasion) + attribute_hidden __COLD; #else __attribute__ ((always_inline)) static inline void @@ -871,7 +872,7 @@ _dl_signal_cexception (int errcode, struct dl_exception *exception, #if IS_IN (rtld) extern void _dl_signal_cerror (int errcode, const char *object, const char *occasion, const char *errstring) - attribute_hidden; + attribute_hidden __COLD; #else __attribute__ ((always_inline)) static inline void @@ -1020,7 +1021,7 @@ extern void _dl_protect_relro (struct link_map *map) attribute_hidden; PLT is nonzero if this was a PLT reloc; it just affects the message. */ extern void _dl_reloc_bad_type (struct link_map *map, unsigned int type, int plt) - attribute_hidden __attribute__ ((__noreturn__)); + attribute_hidden __attribute__ ((__noreturn__)) __COLD; /* Check the version dependencies of all objects available through MAP. If VERBOSE print some more diagnostics. */ diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c index 79ebb0ce..01a16570 100644 --- a/sysdeps/mach/hurd/dl-sysdep.c +++ b/sysdeps/mach/hurd/dl-sysdep.c @@ -753,7 +753,7 @@ strong_alias (_exit, __GI__exit) #endif check_no_hidden(abort); -void weak_function +void weak_function __COLD abort (void) { /* Try to abort using the system specific command. */
The various error reporting functions are unlikely to be called; let the compiler know about this. This will help GCC optimize both these functions and their callers better. In particular, GCC will place the code generated for these functions into a .text.unlikely section in the object files. The linker will then group the .text.unlikely sections together inside the .text section of the resulting binary. This improves code locality. In some cases the compiler may even decide that it's beneficial to separate out the code paths in other functions that lead to a call of a cold function, and also place them into .text.unlikely section. This works both within glibc, and in any external code as long as it's compiled against the new headers containing the __COLD annotations, with a compiler that can understand and make use of them. Signed-off-by: Sergey Bugaev <bugaevc@gmail.com> --- Compared to v1: * got rid of the unintentional _Noreturn vs __attribute__ ((noreturn)) changes; * verified that this commit no longer shows up in 'git log -S' output for either _Noreturn or noreturn. assert/assert.h | 4 ++-- debug/chk_fail.c | 2 +- elf/dl-exception.c | 2 +- elf/dl-minimal.c | 8 ++++---- elf/dl-open.c | 2 +- elf/dl-tls.c | 2 +- include/assert.h | 6 +++--- include/stdio.h | 8 +++++--- include/sys/cdefs.h | 3 +++ malloc/dynarray.h | 2 +- malloc/malloc.c | 3 ++- misc/bits/error.h | 12 ++++++------ stdlib/stdlib.h | 2 +- sysdeps/generic/dl-call_tls_init_tp.h | 2 +- sysdeps/generic/ldsodefs.h | 23 ++++++++++++----------- sysdeps/mach/hurd/dl-sysdep.c | 2 +- 16 files changed, 45 insertions(+), 38 deletions(-)