diff mbox series

[5/6] malloc: Add tcache path for calloc

Message ID 20240822025921.3120998-6-wangyang.guo@intel.com
State New
Headers show
Series [1/6] malloc: Split _int_free() into 3 sub functions | expand

Commit Message

Guo, Wangyang Aug. 22, 2024, 2:59 a.m. UTC
This commit adds tcache support to calloc(), which can greatly improve
the performance of small allocations, especially in multi-threaded
scenarios. clear_mem() is also split out as a helper function for better
code reuse.

Results of the bench-malloc-thread benchmark:

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.724
4 threads  | 0.534

Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
---
 malloc/malloc.c | 111 ++++++++++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 41 deletions(-)
diff mbox series

Patch

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 030aff093b..19fdd72444 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3755,16 +3755,55 @@  __libc_pvalloc (size_t bytes)
   return _mid_memalign (pagesize, rounded_bytes, address);
 }
 
+static __always_inline void *
+clear_mem (void *mem, INTERNAL_SIZE_T csz)
+{
+  INTERNAL_SIZE_T *d;
+  unsigned long clearsize, nclears;
+
+  /* Zero the chunk contents.  Up to 9 words (36 bytes, 72 with 8-byte
+     sizes) are cleared by unrolled stores, larger ones by memset.  The
+     contents hold an odd number of INTERNAL_SIZE_T words; minimally 3.  */
+  d = (INTERNAL_SIZE_T *) mem;
+  clearsize = csz - SIZE_SZ;
+  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
+  assert (nclears >= 3);
+
+  if (nclears > 9)
+    return memset (d, 0, clearsize);
+
+  else
+    {
+      *(d + 0) = 0;
+      *(d + 1) = 0;
+      *(d + 2) = 0;
+      if (nclears > 4)
+        {
+          *(d + 3) = 0;
+          *(d + 4) = 0;
+          if (nclears > 6)
+            {
+              *(d + 5) = 0;
+              *(d + 6) = 0;
+              if (nclears > 8)
+                {
+                  *(d + 7) = 0;
+                  *(d + 8) = 0;
+                }
+            }
+        }
+    }
+
+  return mem;
+}
+
 void *
 __libc_calloc (size_t n, size_t elem_size)
 {
   mstate av;
-  mchunkptr oldtop;
-  INTERNAL_SIZE_T sz, oldtopsize;
+  mchunkptr oldtop, p;
+  INTERNAL_SIZE_T sz, oldtopsize, csz;
   void *mem;
-  unsigned long clearsize;
-  unsigned long nclears;
-  INTERNAL_SIZE_T *d;
   ptrdiff_t bytes;
 
   if (__glibc_unlikely (__builtin_mul_overflow (n, elem_size, &bytes)))
@@ -3780,6 +3819,29 @@  __libc_calloc (size_t n, size_t elem_size)
 
   MAYBE_INIT_TCACHE ();
 
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = checked_request2size (bytes);
+  if (tbytes == 0)
+    {
+      __set_errno (ENOMEM);
+      return NULL;
+    }
+  size_t tc_idx = csize2tidx (tbytes);
+
+  if (tc_idx < mp_.tcache_bins
+      && tcache != NULL
+      && tcache->counts[tc_idx] > 0)
+    {
+      mem = tcache_get (tc_idx);
+      p = mem2chunk (mem);
+      if (__glibc_unlikely (mtag_enabled))
+	return tag_new_zero_region (mem, memsize (p));
+      csz = chunksize (p);
+      return clear_mem (mem, csz);
+    }
+#endif
+
   if (SINGLE_THREAD_P)
     av = &main_arena;
   else
@@ -3834,7 +3896,7 @@  __libc_calloc (size_t n, size_t elem_size)
   if (mem == 0)
     return 0;
 
-  mchunkptr p = mem2chunk (mem);
+  p = mem2chunk (mem);
 
   /* If we are using memory tagging, then we need to set the tags
      regardless of MORECORE_CLEARS, so we zero the whole block while
@@ -3842,7 +3904,7 @@  __libc_calloc (size_t n, size_t elem_size)
   if (__glibc_unlikely (mtag_enabled))
     return tag_new_zero_region (mem, memsize (p));
 
-  INTERNAL_SIZE_T csz = chunksize (p);
+  csz = chunksize (p);
 
   /* Two optional cases in which clearing not necessary */
   if (chunk_is_mmapped (p))
@@ -3861,40 +3923,7 @@  __libc_calloc (size_t n, size_t elem_size)
     }
 #endif
 
-  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
-     contents have an odd number of INTERNAL_SIZE_T-sized words;
-     minimally 3.  */
-  d = (INTERNAL_SIZE_T *) mem;
-  clearsize = csz - SIZE_SZ;
-  nclears = clearsize / sizeof (INTERNAL_SIZE_T);
-  assert (nclears >= 3);
-
-  if (nclears > 9)
-    return memset (d, 0, clearsize);
-
-  else
-    {
-      *(d + 0) = 0;
-      *(d + 1) = 0;
-      *(d + 2) = 0;
-      if (nclears > 4)
-        {
-          *(d + 3) = 0;
-          *(d + 4) = 0;
-          if (nclears > 6)
-            {
-              *(d + 5) = 0;
-              *(d + 6) = 0;
-              if (nclears > 8)
-                {
-                  *(d + 7) = 0;
-                  *(d + 8) = 0;
-                }
-            }
-        }
-    }
-
-  return mem;
+  return clear_mem (mem, csz);
 }
 #endif /* IS_IN (libc) */