Also use l_tls_dtor_count to decide on object unload (BZ #18657)

When an TLS destructor is registered, we set the DF_1_NODELETE flag to
signal that the object should not be destroyed.  We then clear the
DF_1_NODELETE flag when all destructors are called, which is wrong -
the flag could have been set by other means too.

This patch replaces this use of the flag by using l_tls_dtor_count
directly to determine whether it is safe to unload the object.  This
change has the added advantage of eliminating the lock taking when
calling the destructors, which could result in a deadlock.  The patch
also fixes the test case tst-tls-atexit - it was making an invalid
dlclose call, which would just return an error silently.

I have also added a detailed note on concurrency which also aims to
justify why I chose the semantics I chose for accesses to
l_tls_dtor_count.  Thanks to Torvald for his help in getting me
started on this and (literally) teaching my how to approach the
problem.

Change verified on x86_64; the test suite does not show any
regressions due to the patch.

ChangeLog:

	[BZ #18657]
	* elf/dl-close.c (_dl_close_worker): Don't unload DSO if there
	are pending TLS destructor calls.
	* include/link.h (struct link_map): Add concurrency note for
	L_TLS_DTOR_COUNT.
	* stdlib/cxa_thread_atexit_impl.c (__cxa_thread_atexit_impl):
	Don't touch the link map flag.  Atomically increment
	l_tls_dtor_count.
	(__call_tls_dtors): Atomically decrement l_tls_dtor_count.
	Avoid taking the load lock and don't touch the link map flag.
	* stdlib/tst-tls-atexit-nodelete.c: New test case.
	* stdlib/Makefile (tests): Use it.
	* stdlib/tst-tls-atexit.c (do_test): dlopen
	tst-tls-atexit-lib.so again before dlclose.  Add conditionals
	to allow tst-tls-atexit-nodelete test case to use it.
This commit is contained in:
Siddhesh Poyarekar 2015-07-23 11:16:18 +05:30
parent 9c9184b449
commit 90b37cac8b
8 changed files with 177 additions and 37 deletions

View File

@ -1,3 +1,21 @@
2015-07-23 Siddhesh Poyarekar <siddhesh@redhat.com>
[BZ #18657]
* elf/dl-close.c (_dl_close_worker): Don't unload DSO if there
are pending TLS destructor calls.
* include/link.h (struct link_map): Add concurrency note for
L_TLS_DTOR_COUNT.
* stdlib/cxa_thread_atexit_impl.c (__cxa_thread_atexit_impl):
Don't touch the link map flag. Atomically increment
l_tls_dtor_count.
(__call_tls_dtors): Atomically decrement l_tls_dtor_count.
Avoid taking the load lock and don't touch the link map flag.
* stdlib/tst-tls-atexit-nodelete.c: New test case.
* stdlib/Makefile (tests): Use it.
* stdlib/tst-tls-atexit.c (do_test): dlopen
tst-tls-atexit-lib.so again before dlclose. Add conditionals
to allow tst-tls-atexit-nodelete test case to use it.
2015-07-22 Mike Frysinger <vapier@gentoo.org>
* sysdeps/unix/sysv/linux/ia64/bits/msq.h: Change sys/types.h include

2
NEWS
View File

@ -27,7 +27,7 @@ Version 2.22
18522, 18527, 18528, 18529, 18530, 18532, 18533, 18534, 18536, 18539,
18540, 18542, 18544, 18545, 18546, 18547, 18549, 18553, 18557, 18558,
18569, 18583, 18585, 18586, 18592, 18593, 18594, 18602, 18612, 18613,
18619, 18633, 18641, 18643, 18648, 18676, 18694, 18696.
18619, 18633, 18641, 18643, 18648, 18657, 18676, 18694, 18696.
* Cache information can be queried via sysconf() function on s390 e.g. with
_SC_LEVEL1_ICACHE_SIZE as argument.

View File

@ -153,7 +153,11 @@ _dl_close_worker (struct link_map *map, bool force)
maps[idx] = l;
++idx;
/* Clear DF_1_NODELETE to force object deletion. */
/* Clear DF_1_NODELETE to force object deletion. We don't need to touch
l_tls_dtor_count because forced object deletion only happens when an
error occurs during object load. Destructor registration for TLS
non-POD objects should not have happened till then for this
object. */
if (force)
l->l_flags_1 &= ~DF_1_NODELETE;
}
@ -177,6 +181,9 @@ _dl_close_worker (struct link_map *map, bool force)
if (l->l_type == lt_loaded
&& l->l_direct_opencount == 0
&& (l->l_flags_1 & DF_1_NODELETE) == 0
/* See CONCURRENCY NOTES in cxa_thread_atexit_impl.c to know why
acquire is sufficient and correct. */
&& atomic_load_acquire (&l->l_tls_dtor_count) == 0
&& !used[done_index])
continue;

View File

@ -302,7 +302,9 @@ struct link_map
/* Index of the module in the dtv array. */
size_t l_tls_modid;
/* Number of thread_local objects constructed by this DSO. */
/* Number of thread_local objects constructed by this DSO. This is
atomically accessed and modified and is not always protected by the load
lock. See also: CONCURRENCY NOTES in cxa_thread_atexit_impl.c. */
size_t l_tls_dtor_count;
/* Information used to change permission after the relocations are

View File

@ -74,7 +74,7 @@ tests := tst-strtol tst-strtod testmb testrand testsort testdiv \
tst-makecontext3 bug-getcontext bug-fmtmsg1 \
tst-secure-getenv tst-strtod-overflow tst-strtod-round \
tst-tininess tst-strtod-underflow tst-tls-atexit \
tst-setcontext3
tst-setcontext3 tst-tls-atexit-nodelete
tests-static := tst-secure-getenv
modules-names = tst-tls-atexit-lib
@ -159,6 +159,9 @@ tst-tls-atexit-lib.so-no-z-defs = yes
$(objpfx)tst-tls-atexit: $(shared-thread-library) $(libdl)
$(objpfx)tst-tls-atexit.out: $(objpfx)tst-tls-atexit-lib.so
$(objpfx)tst-tls-atexit-nodelete: $(shared-thread-library) $(libdl)
$(objpfx)tst-tls-atexit-nodelete.out: $(objpfx)tst-tls-atexit-lib.so
$(objpfx)tst-setcontext3.out: tst-setcontext3.sh $(objpfx)tst-setcontext3
$(SHELL) $< $(common-objpfx) '$(test-program-prefix-before-env)' \
'$(run-program-env)' '$(test-program-prefix-after-env)' \

View File

@ -16,6 +16,61 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* CONCURRENCY NOTES:
This documents concurrency for the non-POD TLS destructor registration,
calling and destruction. The functions __cxa_thread_atexit_impl,
_dl_close_worker and __call_tls_dtors are the three main routines that may
run concurrently and access shared data. The shared data in all possible
combinations of all three functions are the link map list, a link map for a
DSO and the link map member l_tls_dtor_count.
__cxa_thread_atexit_impl acquires the load_lock before accessing any shared
state and hence multiple of its instances can safely execute concurrently.
_dl_close_worker acquires the load_lock before accessing any shared state as
well and hence can concurrently execute multiple of its own instances as
well as those of __cxa_thread_atexit_impl safely. Not all accesses to
l_tls_dtor_count are protected by the load lock, so we need to synchronize
using atomics.
__call_tls_dtors accesses the l_tls_dtor_count without taking the lock; it
decrements the value by one. It does not need the big lock because it does
not access any other shared state except for the current DSO link map and
its member l_tls_dtor_count.
Correspondingly, _dl_close_worker loads l_tls_dtor_count and if it is zero,
unloads the DSO, thus deallocating the current link map. This is the goal
of maintaining l_tls_dtor_count - to unload the DSO and free resources if
there are no pending destructors to be called.
We want to eliminate the inconsistent state where the DSO is unloaded in
_dl_close_worker before it is used in __call_tls_dtors. This could happen
if __call_tls_dtors uses the link map after it sets l_tls_dtor_count to 0,
since _dl_close_worker will conclude from the 0 l_tls_dtor_count value that
it is safe to unload the DSO. Hence, to ensure that this does not happen,
the following conditions must be met:
1. In _dl_close_worker, the l_tls_dtor_count load happens before the DSO
is unload and its link map is freed
2. The link map dereference in __call_tls_dtors happens before the
l_tls_dtor_count dereference.
To ensure this, the l_tls_dtor_count decrement in __call_tls_dtors should
have release semantics and the load in _dl_close_worker should have acquire
semantics.
Concurrent executions of __call_tls_dtors should only ensure that the value
is accessed atomically; no reordering constraints need to be considered.
Likewise for the increment of l_tls_dtor_count in __cxa_thread_atexit_impl.
There is still a possibility on concurrent execution of _dl_close_worker and
__call_tls_dtors where _dl_close_worker reads the value of l_tls_dtor_count
as 1, __call_tls_dtors decrements the value of l_tls_dtor_count but
_dl_close_worker does not unload the DSO, having read the old value. This
is not very different from a case where __call_tls_dtors is called after
_dl_close_worker on the DSO and hence is an accepted execution. */
#include <stdlib.h>
#include <ldsodefs.h>
@ -49,9 +104,11 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
new->next = tls_dtor_list;
tls_dtor_list = new;
/* See if we already encountered the DSO. */
/* We have to acquire the big lock to prevent a racing dlclose from pulling
our DSO from underneath us while we're setting up our destructor. */
__rtld_lock_lock_recursive (GL(dl_load_lock));
/* See if we already encountered the DSO. */
if (__glibc_unlikely (dso_symbol_cache != dso_symbol))
{
ElfW(Addr) caller = (ElfW(Addr)) dso_symbol;
@ -62,15 +119,16 @@ __cxa_thread_atexit_impl (dtor_func func, void *obj, void *dso_symbol)
program (we hope). */
lm_cache = l ? l : GL(dl_ns)[LM_ID_BASE]._ns_loaded;
}
/* A destructor could result in a thread_local construction and the former
could have cleared the flag. */
if (lm_cache->l_type == lt_loaded && lm_cache->l_tls_dtor_count == 0)
lm_cache->l_flags_1 |= DF_1_NODELETE;
/* This increment may only be concurrently observed either by the decrement
in __call_tls_dtors since the other l_tls_dtor_count access in
_dl_close_worker is protected by the load lock. The execution in
__call_tls_dtors does not really depend on this value beyond the fact that
it should be atomic, so Relaxed MO should be sufficient. */
atomic_fetch_add_relaxed (&lm_cache->l_tls_dtor_count, 1);
__rtld_lock_unlock_recursive (GL(dl_load_lock));
new->map = lm_cache;
new->map->l_tls_dtor_count++;
__rtld_lock_unlock_recursive (GL(dl_load_lock));
return 0;
}
@ -83,19 +141,15 @@ __call_tls_dtors (void)
while (tls_dtor_list)
{
struct dtor_list *cur = tls_dtor_list;
tls_dtor_list = tls_dtor_list->next;
tls_dtor_list = tls_dtor_list->next;
cur->func (cur->obj);
__rtld_lock_lock_recursive (GL(dl_load_lock));
/* Allow DSO unload if count drops to zero. */
cur->map->l_tls_dtor_count--;
if (cur->map->l_tls_dtor_count == 0 && cur->map->l_type == lt_loaded)
cur->map->l_flags_1 &= ~DF_1_NODELETE;
__rtld_lock_unlock_recursive (GL(dl_load_lock));
/* Ensure that the MAP dereference happens before
l_tls_dtor_count decrement. That way, we protect this access from a
potential DSO unload in _dl_close_worker, which happens when
l_tls_dtor_count is 0. See CONCURRENCY NOTES for more detail. */
atomic_fetch_add_release (&cur->map->l_tls_dtor_count, -1);
free (cur);
}
}

View File

@ -0,0 +1,24 @@
/* Verify that a RTLD_NODELETE DSO is not unloaded even if its TLS objects are
destroyed.
Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#define NO_DELETE 1
#define H2_RTLD_FLAGS (RTLD_LAZY | RTLD_NODELETE)
#define LOADED_IS_GOOD true
#include "tst-tls-atexit.c"

View File

@ -16,12 +16,20 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* This test dynamically loads a DSO and spawns a thread that subsequently
calls into the DSO to register a destructor for an object in the DSO and
then calls dlclose on the handle for the DSO. When the thread exits, the
DSO should not be unloaded or else the destructor called during thread exit
will crash. Further in the main thread, the DSO is opened and closed again,
at which point the DSO should be unloaded. */
/* For the default case, i.e. NO_DELETE not defined, the test dynamically loads
a DSO and spawns a thread that subsequently calls into the DSO to register a
destructor for an object in the DSO and then calls dlclose on the handle for
the DSO. When the thread exits, the DSO should not be unloaded or else the
destructor called during thread exit will crash. Further in the main
thread, the DSO is opened and closed again, at which point the DSO should be
unloaded.
When NO_DELETE is defined, the DSO is loaded twice, once with just RTLD_LAZY
flag and the second time with the RTLD_NODELETE flag set. The thread is
spawned, destructor registered and then thread exits without closing the
DSO. In the main thread, the first handle is then closed, followed by the
second handle. In the end, the DSO should remain loaded due to the
RTLD_NODELETE flag being set in the second dlopen call. */
#include <dlfcn.h>
#include <pthread.h>
@ -31,6 +39,14 @@
#include <errno.h>
#include <link.h>
#ifndef NO_DELETE
# define LOADED_IS_GOOD false
#endif
#ifndef H2_RTLD_FLAGS
# define H2_RTLD_FLAGS (RTLD_LAZY)
#endif
#define DSO_NAME "$ORIGIN/tst-tls-atexit-lib.so"
/* Walk through the map in the _r_debug structure to see if our lib is still
@ -43,7 +59,10 @@ is_loaded (void)
for (; lm; lm = lm->l_next)
if (lm->l_type == lt_loaded && lm->l_name
&& strcmp (basename (DSO_NAME), basename (lm->l_name)) == 0)
return true;
{
printf ("%s is still loaded\n", lm->l_name);
return true;
}
return false;
}
@ -63,7 +82,9 @@ reg_dtor_and_close (void *h)
reg_dtor ();
#ifndef NO_DELETE
dlclose (h);
#endif
return NULL;
}
@ -104,19 +125,30 @@ do_test (void)
return 1;
}
#ifndef NO_DELETE
if (spawn_thread (h1) != 0)
return 1;
#endif
void *h2 = dlopen (DSO_NAME, H2_RTLD_FLAGS);
if (h2 == NULL)
{
printf ("h2: Unable to load DSO: %s\n", dlerror ());
return 1;
}
#ifdef NO_DELETE
if (spawn_thread (h1) != 0)
return 1;
/* Now this should unload the DSO. FIXME: This is a bug, calling dlclose
like this is actually wrong, but it works because cxa_thread_atexit_impl
has a bug which results in dlclose allowing this to work. */
dlclose (h1);
#endif
dlclose (h2);
/* Check link maps to ensure that the DSO has unloaded. */
if (is_loaded ())
return 1;
return 0;
/* Check link maps to ensure that the DSO has unloaded. In the normal case,
the DSO should be unloaded if there are no uses. However, if one of the
dlopen calls were with RTLD_NODELETE, the DSO should remain loaded. */
return is_loaded () == LOADED_IS_GOOD ? 0 : 1;
}
#define TEST_FUNCTION do_test ()