diff mbox

[v5] Also use l_tls_dtor_count to decide on object unload (BZ #18657)

Message ID 1437389464-6876-1-git-send-email-siddhesh@redhat.com
State New
Headers show

Commit Message

Siddhesh Poyarekar July 20, 2015, 10:51 a.m. UTC
When an TLS destructor is registered, we set the DF_1_NODELETE flag to
signal that the object should not be destroyed.  We then clear the
DF_1_NODELETE flag when all destructors are called, which is wrong -
the flag could have been set by other means too.

This patch replaces this use of the flag by using l_tls_dtor_count
directly to determine whether it is safe to unload the object.  This
change has the added advantage of eliminating the lock taking when
calling the destructors, which could result in a deadlock.  The patch
also fixes the test case tst-tls-atexit - it was making an invalid
dlclose call, which would just return an error silently.

I have also added a detailed note on concurrency which also aims to
justify why I chose the semantics I chose for accesses to
l_tls_dtor_count.  Thanks to Torvald for his help in getting me
started on this and (literally) teaching my how to approach the
problem.

Change verified on x86_64; the test suite does not show any
regressions due to the patch.

ChangeLog:

	[BZ #18657]
	* elf/dl-close.c (_dl_close_worker): Don't unload DSO if there
	are pending TLS destructor calls.
	* stdlib/cxa_thread_atexit_impl.c (__cxa_thread_atexit_impl):
	Don't touch the link map flag.  Atomically increment
	l_tls_dtor_count.
	(__call_tls_dtors): Atomically decrement l_tls_dtor_count.
	Avoid taking the load lock and don't touch the link map flag.
	* stdlib/tst-tls-atexit.c (do_test): dlopen
	tst-tls-atexit-lib.so again before dlclose.
	* stdlib/tst-tls-atexit-nodelete.c: New test case.
	* stdlib/Makefile (tests): Use it.
---
 elf/dl-close.c                   |   9 +++-
 include/link.h                   |   4 +-
 stdlib/Makefile                  |   5 +-
 stdlib/cxa_thread_atexit_impl.c  |  76 +++++++++++++++++++++-------
 stdlib/tst-tls-atexit-nodelete.c | 104 +++++++++++++++++++++++++++++++++++++++
 stdlib/tst-tls-atexit.c          |  14 ++++--
 6 files changed, 188 insertions(+), 24 deletions(-)
 create mode 100644 stdlib/tst-tls-atexit-nodelete.c
diff mbox

Patch

diff --git a/elf/dl-close.c b/elf/dl-close.c
index 2104674..9105277 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -153,7 +153,11 @@  _dl_close_worker (struct link_map *map, bool force)
       maps[idx] = l;
       ++idx;
 
-      /* Clear DF_1_NODELETE to force object deletion.  */
+      /* Clear DF_1_NODELETE to force object deletion.  We don't need to touch
+	 l_tls_dtor_count because forced object deletion only happens when an
+	 error occurs during object load.  Destructor registration for TLS
+	 non-POD objects should not have happened till then for this
+	 object.  */
       if (force)
 	l->l_flags_1 &= ~DF_1_NODELETE;
     }
@@ -177,6 +181,9 @@  _dl_close_worker (struct link_map *map, bool force)
       if (l->l_type == lt_loaded
 	  && l->l_direct_opencount == 0
 	  && (l->l_flags_1 & DF_1_NODELETE) == 0
+	  /* See CONCURRENCY NOTES in cxa_thread_atexit_impl.c to know why
+	     acquire is sufficient and correct.  */
+	  && atomic_load_acquire (&l->l_tls_dtor_count) == 0
 	  && !used[done_index])
 	continue;
 
diff --git a/include/link.h b/include/link.h
index 423a695..7e7eb79 100644
--- a/include/link.h
+++ b/include/link.h
@@ -302,7 +302,9 @@  struct link_map
     /* Index of the module in the dtv array.  */
     size_t l_tls_modid;
 
-    /* Number of thread_local objects constructed by this DSO.  */
+    /* Number of thread_local objects constructed by this DSO.  This is
+       atomically accessed and modified and is not always protected by the load
+       lock.  See also: CONCURRENCY NOTES in cxa_thread_atexit_impl.c.  */
     size_t l_tls_dtor_count;
 
     /* Information used to change permission after the relocations are
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 7fc5a80..402466a 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -74,7 +74,7 @@  tests		:= tst-strtol tst-strtod testmb testrand testsort testdiv   \
 		   tst-makecontext3 bug-getcontext bug-fmtmsg1		    \
 		   tst-secure-getenv tst-strtod-overflow tst-strtod-round   \
 		   tst-tininess tst-strtod-underflow tst-tls-atexit	    \
-		   tst-setcontext3
+		   tst-setcontext3 tst-tls-atexit-nodelete
 tests-static	:= tst-secure-getenv
 
 modules-names	= tst-tls-atexit-lib
@@ -159,6 +159,9 @@  tst-tls-atexit-lib.so-no-z-defs = yes
 $(objpfx)tst-tls-atexit: $(shared-thread-library) $(libdl)
 $(objpfx)tst-tls-atexit.out: $(objpfx)tst-tls-atexit-lib.so
 
+$(objpfx)tst-tls-atexit-nodelete: $(shared-thread-library) $(libdl)
+$(objpfx)tst-tls-atexit-nodelete.out: $(objpfx)tst-tls-atexit-lib.so
+
 $(objpfx)tst-setcontext3.out: tst-setcontext3.sh $(objpfx)tst-setcontext3
 	$(SHELL) $< $(common-objpfx) '$(test-program-prefix-before-env)' \
 		 '$(run-program-env)' '$(test-program-prefix-after-env)' \
diff --git a/stdlib/cxa_thread_atexit_impl.c b/stdlib/cxa_thread_atexit_impl.c
index 7a7d3d6..dc51ab5 100644
--- a/stdlib/cxa_thread_atexit_impl.c
+++ b/stdlib/cxa_thread_atexit_impl.c
@@ -16,6 +16,50 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+/* CONCURRENCY NOTES:
+
+   The functions __cxa_thread_atexit_impl, _dl_close_worker and
+   __call_tls_dtors are the three main routines that may run concurrently and
+   access shared data.  The shared data in all possible combinations of all
+   three functions are the link map list, a link map for a DSO and the link map
+   member l_tls_dtor_count.
+
+   __cxa_thread_atexit_impl takes the load_lock before modifying any shared
+   state and hence can concurrently execute multiple of its instances in
+   different threads safely.
+
+   _dl_close_worker takes the load_lock before modifying any shared state as
+   well and hence can concurrently execute multiple of its own instances as
+   well as those of __cxa_thread_atexit_impl safely.
+
+   __call_tls_dtors does not take the load lock, but reads only the link map
+   of the current DSO and its member l_tls_dtor_count.  It has to ensure that
+   l_tls_dtor_count is decremented atomically
+
+   Correspondingly, _dl_close_worker loads l_tls_dtor_count and if it is zero,
+   unloads the DSO, thus dereferencing the current link map.  Hence, for
+   concurrent execution of _dl_close_worker and __call_tls_dtors, it is
+   necessary that:
+
+   1. The DSO unload in _dl_close_worker happens after l_tls_dtor_count, i.e.
+      the l_tls_dtor_count load does not get reordered to after the DSO is
+      unloaded
+   2. The link map dereference in __call_tls_dtors happens before the
+      l_tls_dtor_count dereference.
+
+   to ensure that we eliminate the inconsistent state where the DSO is unloaded
+   before it is dereferenced in __call_tls_dtors, which could happen if any of
+   the above two pairs of sequences were reordered.
+
+   To ensure this, the l_tls_dtor_count decrement in __call_tls_dtors should
+   have release semantics and the load in _dl_close_worker should have acquire
+   semantics.
+
+   Concurrent executions of __call_tls_dtors should only ensure that the value
+   is modified atomically; no reordering constraints need to be considered.
+   Likewise for the increment of l_tls_dtor_count in
+   __cxa_thread_atexit_impl.  */
+
 #include <stdlib.h>
 #include <ldsodefs.h>
 
@@ -49,9 +93,11 @@  __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
   new->next = tls_dtor_list;
   tls_dtor_list = new;
 
-  /* See if we already encountered the DSO.  */
+  /* We have to take the big lock to prevent a racing dlclose from pulling our
+     DSO from underneath us while we're setting up our destructor.  */
   __rtld_lock_lock_recursive (GL(dl_load_lock));
 
+  /* See if we already encountered the DSO.  */
   if (__glibc_unlikely (dso_symbol_cache != dso_symbol))
     {
       ElfW(Addr) caller = (ElfW(Addr)) dso_symbol;
@@ -62,16 +108,14 @@  __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
 	 program (we hope).  */
       lm_cache = l ? l : GL(dl_ns)[LM_ID_BASE]._ns_loaded;
     }
-  /* A destructor could result in a thread_local construction and the former
-     could have cleared the flag.  */
-  if (lm_cache->l_type == lt_loaded && lm_cache->l_tls_dtor_count == 0)
-    lm_cache->l_flags_1 |= DF_1_NODELETE;
-
-  new->map = lm_cache;
-  new->map->l_tls_dtor_count++;
 
+  /* Relaxed since the only shared state here is l_tls_dtor_count and hence
+     there are no reordering issues.  */
+  atomic_fetch_add_relaxed (&lm_cache->l_tls_dtor_count, 1);
   __rtld_lock_unlock_recursive (GL(dl_load_lock));
 
+  new->map = lm_cache;
+
   return 0;
 }
 
@@ -83,19 +127,15 @@  __call_tls_dtors (void)
   while (tls_dtor_list)
     {
       struct dtor_list *cur = tls_dtor_list;
-      tls_dtor_list = tls_dtor_list->next;
 
+      tls_dtor_list = tls_dtor_list->next;
       cur->func (cur->obj);
 
-      __rtld_lock_lock_recursive (GL(dl_load_lock));
-
-      /* Allow DSO unload if count drops to zero.  */
-      cur->map->l_tls_dtor_count--;
-      if (cur->map->l_tls_dtor_count == 0 && cur->map->l_type == lt_loaded)
-        cur->map->l_flags_1 &= ~DF_1_NODELETE;
-
-      __rtld_lock_unlock_recursive (GL(dl_load_lock));
-
+      /* Ensure that the MAP dereference is not reordered below the
+	 l_tls_dtor_count decrement.  This ensures that it synchronizes with
+	 the load in _dl_close_worker and keeps this dereference before the DSO
+	 unload.  See CONCURRENCY NOTES for more detail.  */
+      atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);
       free (cur);
     }
 }
diff --git a/stdlib/tst-tls-atexit-nodelete.c b/stdlib/tst-tls-atexit-nodelete.c
new file mode 100644
index 0000000..cb2b39d
--- /dev/null
+++ b/stdlib/tst-tls-atexit-nodelete.c
@@ -0,0 +1,104 @@ 
+/* Verify that a RTLD_NODELETE DSO is not unloaded even if its TLS objects are
+   destroyed.
+
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+
+static void *
+use_handle (void *h)
+{
+  void (*reg_dtor) (void) = (void (*) (void)) dlsym (h, "reg_dtor");
+
+  if (reg_dtor == NULL)
+    {
+      printf ("Unable to find symbol reg_dtor: %s\n", dlerror ());
+      return (void *) (uintptr_t) 1;
+    }
+
+  reg_dtor ();
+
+  return NULL;
+}
+
+static int
+do_test (void)
+{
+  pthread_t t;
+  int ret;
+  void *thr_ret;
+
+  void *h1 = dlopen ("$ORIGIN/tst-tls-atexit-lib.so", RTLD_LAZY);
+  if (h1 == NULL)
+    {
+      printf ("h1: Unable to load DSO: %s\n", dlerror ());
+      return 1;
+    }
+
+  void *h2 = dlopen ("$ORIGIN/tst-tls-atexit-lib.so",
+		     RTLD_LAZY | RTLD_NODELETE);
+  if (h2 == NULL)
+    {
+      printf ("h2: Unable to load DSO: %s\n", dlerror ());
+      return 1;
+    }
+
+  /* FOO is our variable to test if the DSO is unloaded.  */
+  int *foo = dlsym (h2, "foo_var");
+
+  if (foo == NULL)
+    {
+      printf ("Unable to find symbol do_foo: %s\n", dlerror ());
+      return 1;
+    }
+
+  /* Print FOO to be sure that it is working OK.  */
+  printf ("%d\n", *foo);
+
+  if ((ret = pthread_create (&t, NULL, use_handle, h1)) != 0)
+    {
+      printf ("pthread_create failed: %s\n", strerror (ret));
+      return 1;
+    }
+
+  if ((ret = pthread_join (t, &thr_ret)) != 0)
+    {
+      printf ("pthread_join failed: %s\n", strerror (ret));
+      return 1;
+    }
+
+  if (thr_ret != NULL)
+    return 1;
+
+  /* Closing the handles should not unload the DSO.  */
+  dlclose (h1);
+  dlclose (h2);
+
+  /* This should not crash.  */
+  printf ("%d\n", *foo);
+
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdlib/tst-tls-atexit.c b/stdlib/tst-tls-atexit.c
index 0a9e920..ccea8a7 100644
--- a/stdlib/tst-tls-atexit.c
+++ b/stdlib/tst-tls-atexit.c
@@ -99,9 +99,17 @@  do_test (void)
   if (thr_ret != NULL)
     return 1;
 
-  /* Now this should unload the DSO.  FIXME: This is a bug, calling dlclose
-     like this is actually wrong, but it works because cxa_thread_atexit_impl
-     has a bug which results in dlclose allowing this to work.  */
+  /* Now this sequence should unload the DSO.  Note that we don't call do_foo,
+     because we don't want to register the thread destructor.  We just want to
+     make sure that in the absence of pending destructors, the library is
+     unloaded correctly.  */
+  handle = dlopen ("$ORIGIN/tst-tls-atexit-lib.so", RTLD_LAZY);
+  if (handle == NULL)
+    {
+      printf ("2: Unable to load DSO: %s\n", dlerror ());
+      return 1;
+    }
+
   dlclose (handle);
 
   /* Handle SIGSEGV.  We don't use the EXPECTED_SIGNAL macro because we want to