@@ -153,7 +153,11 @@ _dl_close_worker (struct link_map *map, bool force)
maps[idx] = l;
++idx;
- /* Clear DF_1_NODELETE to force object deletion. */
+ /* Clear DF_1_NODELETE to force object deletion. We don't need to touch
+ l_tls_dtor_count because forced object deletion only happens when an
+ error occurs during object load. Destructor registration for TLS
+ non-POD objects should not have happened till then for this
+ object. */
if (force)
l->l_flags_1 &= ~DF_1_NODELETE;
}
@@ -177,6 +181,9 @@ _dl_close_worker (struct link_map *map, bool force)
if (l->l_type == lt_loaded
&& l->l_direct_opencount == 0
&& (l->l_flags_1 & DF_1_NODELETE) == 0
+ /* See CONCURRENCY NOTES in cxa_thread_atexit_impl.c to know why
+ acquire is sufficient and correct. */
+ && atomic_load_acquire (&l->l_tls_dtor_count) == 0
&& !used[done_index])
continue;
@@ -302,7 +302,9 @@ struct link_map
/* Index of the module in the dtv array. */
size_t l_tls_modid;
- /* Number of thread_local objects constructed by this DSO. */
+ /* Number of thread_local objects constructed by this DSO. This is
+ atomically accessed and modified and is not always protected by the load
+ lock. See also: CONCURRENCY NOTES in cxa_thread_atexit_impl.c. */
size_t l_tls_dtor_count;
/* Information used to change permission after the relocations are
@@ -74,7 +74,7 @@ tests := tst-strtol tst-strtod testmb testrand testsort testdiv \
tst-makecontext3 bug-getcontext bug-fmtmsg1 \
tst-secure-getenv tst-strtod-overflow tst-strtod-round \
tst-tininess tst-strtod-underflow tst-tls-atexit \
- tst-setcontext3
+ tst-setcontext3 tst-tls-atexit-nodelete
tests-static := tst-secure-getenv
modules-names = tst-tls-atexit-lib
@@ -159,6 +159,9 @@ tst-tls-atexit-lib.so-no-z-defs = yes
$(objpfx)tst-tls-atexit: $(shared-thread-library) $(libdl)
$(objpfx)tst-tls-atexit.out: $(objpfx)tst-tls-atexit-lib.so
+$(objpfx)tst-tls-atexit-nodelete: $(shared-thread-library) $(libdl)
+$(objpfx)tst-tls-atexit-nodelete.out: $(objpfx)tst-tls-atexit-lib.so
+
$(objpfx)tst-setcontext3.out: tst-setcontext3.sh $(objpfx)tst-setcontext3
$(SHELL) $< $(common-objpfx) '$(test-program-prefix-before-env)' \
'$(run-program-env)' '$(test-program-prefix-after-env)' \
@@ -16,6 +16,50 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+/* CONCURRENCY NOTES:
+
+ The functions __cxa_thread_atexit_impl, _dl_close_worker and
+ __call_tls_dtors are the three main routines that may run concurrently and
+ access shared data. The shared data in all possible combinations of all
+ three functions are the link map list, a link map for a DSO and the link map
+ member l_tls_dtor_count.
+
+ __cxa_thread_atexit_impl takes the load_lock before modifying any shared
+ state and hence can concurrently execute multiple of its instances in
+ different threads safely.
+
+ _dl_close_worker takes the load_lock before modifying any shared state as
+ well and hence can concurrently execute multiple of its own instances as
+ well as those of __cxa_thread_atexit_impl safely.
+
+ __call_tls_dtors does not take the load lock, but reads only the link map
+ of the current DSO and its member l_tls_dtor_count. It has to ensure that
+ l_tls_dtor_count is decremented atomically.
+
+ Correspondingly, _dl_close_worker loads l_tls_dtor_count and if it is zero,
+ unloads the DSO, thus dereferencing the current link map. Hence, for
+ concurrent execution of _dl_close_worker and __call_tls_dtors, it is
+ necessary that:
+
+ 1. The DSO unload in _dl_close_worker happens after the load of
+ l_tls_dtor_count, i.e. the l_tls_dtor_count load does not get reordered
+ to after the DSO is unloaded.
+ 2. The link map dereference in __call_tls_dtors happens before the
+ l_tls_dtor_count dereference.
+
+ These two constraints eliminate the inconsistent state where the DSO is
+ unloaded before it is dereferenced in __call_tls_dtors, which could happen
+ if either of the above pairs of accesses were reordered.
+
+ To ensure this, the l_tls_dtor_count decrement in __call_tls_dtors should
+ have release semantics and the load in _dl_close_worker should have acquire
+ semantics.
+
+ Concurrent executions of __call_tls_dtors should only ensure that the value
+ is modified atomically; no reordering constraints need to be considered.
+ Likewise for the increment of l_tls_dtor_count in
+ __cxa_thread_atexit_impl. */
+
#include <stdlib.h>
#include <ldsodefs.h>
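The acquire/release requirement in the notes above can be modelled with plain
C11 atomics. The sketch below is illustrative only; fake_map, run_dtor and
may_unload are made-up names, and glibc itself uses its internal
atomic_fetch_add_release and atomic_load_acquire macros rather than
<stdatomic.h>:

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Stand-in for a link map; only the destructor count and some payload
       representing the mapped DSO matter for the ordering argument.  */
    struct fake_map
    {
      atomic_size_t tls_dtor_count;
      int payload;
    };

    /* What __call_tls_dtors does for one registered destructor: the map
       dereference must not be reordered past the decrement, hence the
       release ordering on the read-modify-write.  */
    void
    run_dtor (struct fake_map *map, int *out)
    {
      *out = map->payload;
      atomic_fetch_sub_explicit (&map->tls_dtor_count, 1,
                                 memory_order_release);
    }

    /* What _dl_close_worker does when deciding whether the object may be
       unloaded: the acquire load pairs with the release decrement above, so
       the unload cannot be observed before the dereference in run_dtor.  */
    bool
    may_unload (struct fake_map *map)
    {
      return (atomic_load_explicit (&map->tls_dtor_count,
                                    memory_order_acquire) == 0);
    }

If the pair were weakened to relaxed, nothing would prevent the unload from
being ordered before the payload read on a weakly ordered machine, which is
exactly the inconsistent state the notes rule out.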
@@ -49,9 +93,11 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
new->next = tls_dtor_list;
tls_dtor_list = new;
- /* See if we already encountered the DSO. */
+ /* We have to take the big lock to prevent a racing dlclose from pulling our
+ DSO from underneath us while we're setting up our destructor. */
__rtld_lock_lock_recursive (GL(dl_load_lock));
+ /* See if we already encountered the DSO. */
if (__glibc_unlikely (dso_symbol_cache != dso_symbol))
{
ElfW(Addr) caller = (ElfW(Addr)) dso_symbol;
@@ -62,16 +108,14 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
program (we hope). */
lm_cache = l ? l : GL(dl_ns)[LM_ID_BASE]._ns_loaded;
}
- /* A destructor could result in a thread_local construction and the former
- could have cleared the flag. */
- if (lm_cache->l_type == lt_loaded && lm_cache->l_tls_dtor_count == 0)
- lm_cache->l_flags_1 |= DF_1_NODELETE;
-
- new->map = lm_cache;
- new->map->l_tls_dtor_count++;
+ /* Relaxed since the only shared state here is l_tls_dtor_count and hence
+ there are no reordering issues. */
+ atomic_fetch_add_relaxed (&lm_cache->l_tls_dtor_count, 1);
__rtld_lock_unlock_recursive (GL(dl_load_lock));
+ new->map = lm_cache;
+
return 0;
}
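For context on how this registration path is reached: a C++ thread_local
object with a non-trivial destructor makes the front end register the
destructor against the calling DSO's __dso_handle, normally via
__cxa_thread_atexit in libstdc++, which forwards to the function above. A
rough C approximation of that emitted registration is sketched below; widget,
widget_dtor and register_widget_dtor are made-up names:

    /* Provided by crtbegin.o in each DSO and in the main program; its
       address lies inside the calling object, which is what lets
       _dl_find_dso_for_object map it back to a link map.  */
    extern void *__dso_handle;

    /* glibc's registration entry point, as implemented in this file.  */
    extern int __cxa_thread_atexit_impl (void (*func) (void *), void *obj,
                                         void *dso_symbol);

    struct widget { int refs; };
    static __thread struct widget w;

    static void
    widget_dtor (void *obj)
    {
      /* Stand-in for the real destructor body.  */
      ((struct widget *) obj)->refs = 0;
    }

    /* Roughly what the compiler emits on the first use of such an object in
       a thread: register the destructor, which bumps l_tls_dtor_count of the
       containing DSO under dl_load_lock.  */
    void
    register_widget_dtor (void)
    {
      __cxa_thread_atexit_impl (widget_dtor, &w, &__dso_handle);
    }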
@@ -83,19 +127,15 @@ __call_tls_dtors (void)
while (tls_dtor_list)
{
struct dtor_list *cur = tls_dtor_list;
- tls_dtor_list = tls_dtor_list->next;
+ tls_dtor_list = tls_dtor_list->next;
cur->func (cur->obj);
- __rtld_lock_lock_recursive (GL(dl_load_lock));
-
- /* Allow DSO unload if count drops to zero. */
- cur->map->l_tls_dtor_count--;
- if (cur->map->l_tls_dtor_count == 0 && cur->map->l_type == lt_loaded)
- cur->map->l_flags_1 &= ~DF_1_NODELETE;
-
- __rtld_lock_unlock_recursive (GL(dl_load_lock));
-
+ /* Ensure that the MAP dereference is not reordered past the
+ l_tls_dtor_count decrement. The release decrement synchronizes with the
+ acquire load in _dl_close_worker and keeps this dereference before the
+ DSO unload. See CONCURRENCY NOTES for more detail. */
+ atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);
free (cur);
}
}
new file mode 100644
@@ -0,0 +1,104 @@
+/* Verify that an RTLD_NODELETE DSO is not unloaded even if its TLS objects are
+ destroyed.
+
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <dlfcn.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+
+static void *
+use_handle (void *h)
+{
+ void (*reg_dtor) (void) = (void (*) (void)) dlsym (h, "reg_dtor");
+
+ if (reg_dtor == NULL)
+ {
+ printf ("Unable to find symbol reg_dtor: %s\n", dlerror ());
+ return (void *) (uintptr_t) 1;
+ }
+
+ reg_dtor ();
+
+ return NULL;
+}
+
+static int
+do_test (void)
+{
+ pthread_t t;
+ int ret;
+ void *thr_ret;
+
+ void *h1 = dlopen ("$ORIGIN/tst-tls-atexit-lib.so", RTLD_LAZY);
+ if (h1 == NULL)
+ {
+ printf ("h1: Unable to load DSO: %s\n", dlerror ());
+ return 1;
+ }
+
+ void *h2 = dlopen ("$ORIGIN/tst-tls-atexit-lib.so",
+ RTLD_LAZY | RTLD_NODELETE);
+ if (h2 == NULL)
+ {
+ printf ("h2: Unable to load DSO: %s\n", dlerror ());
+ return 1;
+ }
+
+ /* FOO is our variable to test if the DSO is unloaded. */
+ int *foo = dlsym (h2, "foo_var");
+
+ if (foo == NULL)
+ {
+ printf ("Unable to find symbol do_foo: %s\n", dlerror ());
+ return 1;
+ }
+
+ /* Print FOO to be sure that it is working OK. */
+ printf ("%d\n", *foo);
+
+ if ((ret = pthread_create (&t, NULL, use_handle, h1)) != 0)
+ {
+ printf ("pthread_create failed: %s\n", strerror (ret));
+ return 1;
+ }
+
+ if ((ret = pthread_join (t, &thr_ret)) != 0)
+ {
+ printf ("pthread_join failed: %s\n", strerror (ret));
+ return 1;
+ }
+
+ if (thr_ret != NULL)
+ return 1;
+
+ /* Closing the handles should not unload the DSO. */
+ dlclose (h1);
+ dlclose (h2);
+
+ /* This should not crash. */
+ printf ("%d\n", *foo);
+
+ return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
@@ -99,9 +99,17 @@ do_test (void)
if (thr_ret != NULL)
return 1;
- /* Now this should unload the DSO. FIXME: This is a bug, calling dlclose
- like this is actually wrong, but it works because cxa_thread_atexit_impl
- has a bug which results in dlclose allowing this to work. */
+ /* Now this sequence should unload the DSO. Note that we don't call do_foo,
+ because we don't want to register the thread destructor. We just want to
+ make sure that in the absence of pending destructors, the library is
+ unloaded correctly. */
+ handle = dlopen ("$ORIGIN/tst-tls-atexit-lib.so", RTLD_LAZY);
+ if (handle == NULL)
+ {
+ printf ("2: Unable to load DSO: %s\n", dlerror ());
+ return 1;
+ }
+
dlclose (handle);
/* Handle SIGSEGV. We don't use the EXPECTED_SIGNAL macro because we want to