Message ID | PAWPR08MB89829B5E839D4A08FB5AA2EA83D82@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | malloc: Improve performance of __libc_malloc | expand |
Hi Wilco, Just wondering why you choose to set __attribute_noinline__ to __libc_malloc2 ? I know it is tailcalled, but having a single caller, would it not make sense to inline? Anything else and for what is worth, all seems Ok. ;-) Cheers, Cupertino On 20-03-2025 21:11, Wilco Dijkstra wrote: > > Improve performance of __libc_malloc by splitting it into 2 parts: first handle > the tcache fastpath, then do the rest in a separate tailcalled function. > This results in significant performance gains since __libc_malloc doesn't need > to setup a frame and we delay tcache initialization and setting of errno until > later. > > On Neoverse V2, bench-malloc-simple improves by 6.7% overall (up to 8.5% for > ST case) and bench-malloc-thread improves by 19.4% for 1 thread and 14.2% for > 32 threads. > > Passes regress, OK for commit? > > --- > > diff --git a/malloc/malloc.c b/malloc/malloc.c > index 06b508e22b14756d4aee8c6f62f0781557c7a96f..10526b58ed45d7733c8de9c70bc6ced300fedb05 100644 > --- a/malloc/malloc.c > +++ b/malloc/malloc.c > @@ -1325,6 +1325,9 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ > static __always_inline size_t > checked_request2size (size_t req) __nonnull (1) > { > + _Static_assert (PTRDIFF_MAX <= SIZE_MAX / 2, > + "PTRDIFF_MAX is not more than half of SIZE_MAX"); > + > if (__glibc_unlikely (req > PTRDIFF_MAX)) > return 0; > > @@ -3380,26 +3383,17 @@ tcache_thread_shutdown (void) > #endif /* !USE_TCACHE */ > > #if IS_IN (libc) > -void * > -__libc_malloc (size_t bytes) > + > +static void * __attribute_noinline__ > +__libc_malloc2 (size_t bytes) > { > mstate ar_ptr; > void *victim; > > - _Static_assert (PTRDIFF_MAX <= SIZE_MAX / 2, > - "PTRDIFF_MAX is not more than half of SIZE_MAX"); > - > if (!__malloc_initialized) > ptmalloc_init (); > -#if USE_TCACHE > - bool err = tcache_try_malloc (bytes, &victim); > - > - if (err) > - return NULL; > > - if (victim) > - return tag_new_usable (victim); > -#endif > + MAYBE_INIT_TCACHE (); > > if (SINGLE_THREAD_P) > { > @@ -3430,6 +3424,19 @@ __libc_malloc (size_t bytes) > ar_ptr == arena_for_chunk (mem2chunk (victim))); > return victim; > } > + > +void * > +__libc_malloc (size_t bytes) > +{ > +#if USE_TCACHE > + size_t tc_idx = csize2tidx (checked_request2size (bytes)); > + > + if (tcache_available (tc_idx)) > + return tcache_get (tc_idx); > +#endif > + > + return __libc_malloc2 (bytes); > +} > libc_hidden_def (__libc_malloc) > > void > > >
Hi Cupertino, > Just wondering why you choose to set __attribute_noinline__ to > __libc_malloc2 ? > I know it is tailcalled, but having a single caller, would it not make > sense to inline? The tailcall means it ends up being a leaf function and thus doesn't need a frame. If you inline it, you execute a prolog and epilog even if you just use the tcache fastpath. Cheers, Wilco
diff --git a/malloc/malloc.c b/malloc/malloc.c index 06b508e22b14756d4aee8c6f62f0781557c7a96f..10526b58ed45d7733c8de9c70bc6ced300fedb05 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -1325,6 +1325,9 @@ nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ static __always_inline size_t checked_request2size (size_t req) __nonnull (1) { + _Static_assert (PTRDIFF_MAX <= SIZE_MAX / 2, + "PTRDIFF_MAX is not more than half of SIZE_MAX"); + if (__glibc_unlikely (req > PTRDIFF_MAX)) return 0; @@ -3380,26 +3383,17 @@ tcache_thread_shutdown (void) #endif /* !USE_TCACHE */ #if IS_IN (libc) -void * -__libc_malloc (size_t bytes) + +static void * __attribute_noinline__ +__libc_malloc2 (size_t bytes) { mstate ar_ptr; void *victim; - _Static_assert (PTRDIFF_MAX <= SIZE_MAX / 2, - "PTRDIFF_MAX is not more than half of SIZE_MAX"); - if (!__malloc_initialized) ptmalloc_init (); -#if USE_TCACHE - bool err = tcache_try_malloc (bytes, &victim); - - if (err) - return NULL; - if (victim) - return tag_new_usable (victim); -#endif + MAYBE_INIT_TCACHE (); if (SINGLE_THREAD_P) { @@ -3430,6 +3424,19 @@ __libc_malloc (size_t bytes) ar_ptr == arena_for_chunk (mem2chunk (victim))); return victim; } + +void * +__libc_malloc (size_t bytes) +{ +#if USE_TCACHE + size_t tc_idx = csize2tidx (checked_request2size (bytes)); + + if (tcache_available (tc_idx)) + return tcache_get (tc_idx); +#endif + + return __libc_malloc2 (bytes); +} libc_hidden_def (__libc_malloc) void