@@ -3755,16 +3755,55 @@ __libc_pvalloc (size_t bytes)
return _mid_memalign (pagesize, rounded_bytes, address);
}
+/* Zero the payload of a chunk and return MEM.
+
+   MEM points at the start of the payload and CSZ is the chunk size; the
+   region cleared is CSZ - SIZE_SZ bytes long.  Payloads of at most nine
+   INTERNAL_SIZE_T words (36 bytes, or 72 with 8-byte sizes) are cleared
+   with inline stores; anything larger is handed to memset.  */
+static __always_inline void *
+clear_mem (void *mem, INTERNAL_SIZE_T csz)
+{
+  INTERNAL_SIZE_T *words = (INTERNAL_SIZE_T *) mem;
+  unsigned long nbytes = csz - SIZE_SZ;
+  unsigned long nwords = nbytes / sizeof (INTERNAL_SIZE_T);
+
+  /* The payload is expected to hold an odd number of INTERNAL_SIZE_T
+     words, minimally 3, which is why the stores below proceed in steps
+     of two after the first three.  */
+  assert (nwords >= 3);
+
+  /* Too large to be worth unrolling.  */
+  if (nwords > 9)
+    return memset (words, 0, nbytes);
+
+  words[0] = 0;
+  words[1] = 0;
+  words[2] = 0;
+  if (nwords <= 4)
+    return mem;
+
+  words[3] = 0;
+  words[4] = 0;
+  if (nwords <= 6)
+    return mem;
+
+  words[5] = 0;
+  words[6] = 0;
+  if (nwords <= 8)
+    return mem;
+
+  words[7] = 0;
+  words[8] = 0;
+  return mem;
+}
+
void *
__libc_calloc (size_t n, size_t elem_size)
{
mstate av;
- mchunkptr oldtop;
- INTERNAL_SIZE_T sz, oldtopsize;
+ mchunkptr oldtop, p;
+ INTERNAL_SIZE_T sz, oldtopsize, csz;
void *mem;
- unsigned long clearsize;
- unsigned long nclears;
- INTERNAL_SIZE_T *d;
ptrdiff_t bytes;
if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3780,6 +3819,29 @@ __libc_calloc (size_t n, size_t elem_size)
MAYBE_INIT_TCACHE ();
+#if USE_TCACHE
+ /* int_free also calls request2size, be careful to not pad twice. */
+ size_t tbytes = checked_request2size (bytes);
+ if (tbytes == 0)
+ {
+ __set_errno (ENOMEM);
+ return NULL;
+ }
+ size_t tc_idx = csize2tidx (tbytes);
+
+ if (tc_idx < mp_.tcache_bins
+ && tcache != NULL
+ && tcache->counts[tc_idx] > 0)
+ {
+ mem = tcache_get (tc_idx);
+ p = mem2chunk (mem);
+ if (__glibc_unlikely (mtag_enabled))
+ return tag_new_zero_region (mem, memsize (p));
+ csz = chunksize (p);
+ return clear_mem (mem, csz);
+ }
+#endif
+
if (SINGLE_THREAD_P)
av = &main_arena;
else
@@ -3834,7 +3896,7 @@ __libc_calloc (size_t n, size_t elem_size)
if (mem == 0)
return 0;
- mchunkptr p = mem2chunk (mem);
+ p = mem2chunk (mem);
/* If we are using memory tagging, then we need to set the tags
regardless of MORECORE_CLEARS, so we zero the whole block while
@@ -3842,7 +3904,7 @@ __libc_calloc (size_t n, size_t elem_size)
if (__glibc_unlikely (mtag_enabled))
return tag_new_zero_region (mem, memsize (p));
- INTERNAL_SIZE_T csz = chunksize (p);
+ csz = chunksize (p);
/* Two optional cases in which clearing not necessary */
if (chunk_is_mmapped (p))
@@ -3861,40 +3923,7 @@ __libc_calloc (size_t n, size_t elem_size)
}
#endif
- /* Unroll clear of <= 36 bytes (72 if 8byte sizes). We know that
- contents have an odd number of INTERNAL_SIZE_T-sized words;
- minimally 3. */
- d = (INTERNAL_SIZE_T *) mem;
- clearsize = csz - SIZE_SZ;
- nclears = clearsize / sizeof (INTERNAL_SIZE_T);
- assert (nclears >= 3);
-
- if (nclears > 9)
- return memset (d, 0, clearsize);
-
- else
- {
- *(d + 0) = 0;
- *(d + 1) = 0;
- *(d + 2) = 0;
- if (nclears > 4)
- {
- *(d + 3) = 0;
- *(d + 4) = 0;
- if (nclears > 6)
- {
- *(d + 5) = 0;
- *(d + 6) = 0;
- if (nclears > 8)
- {
- *(d + 7) = 0;
- *(d + 8) = 0;
- }
- }
- }
- }
-
- return mem;
+ return clear_mem (mem, csz);
}
#endif /* IS_IN (libc) */
This commit adds tcache support to calloc(), which can greatly improve the performance of small-size allocations, especially in multi-threaded scenarios. clear_mem() is also split out as a helper function so that the code can be reused.

Result of bench-malloc-thread benchmark

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.724
4 threads  | 0.534

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
malloc/malloc.c | 111 ++++++++++++++++++++++++++++++------------------
1 file changed, 70 insertions(+), 41 deletions(-)