Skip to content

Commit 0d90b4d

Browse files
committed
atexit: skip TLS-cached pools once shutdown has freed them
`fr_atexit_thread_trigger_all()` runs each registered thread destructor on the calling (main) thread, including ones that free `_Thread_local` caches owned by threads outside our schedule (librdkafka's bg threads, perl, etc.). We can free their pool chunks but can't reset another thread's TLS slot, so the next call from those threads dereferences a dangling pointer and aborts with "Bad talloc magic value" - first seen in `_kafka_log_cb` from the "Terminating instance" debug line emitted inside `rd_kafka_destroy()` during `mod_detach`. Add `fr_atexit_thread_local_disable_alloc()` / ..._alloc_disabled()` in atexit.c as the single source of truth, called once from radiusd *before* the trigger. TLS-pool initialisers consult it before reading their slot and fall back to `talloc_*(NULL, ...)` when set: - log.c:fr_log_pool_init returns NULL - sbuff.c:sbuff_scratch_init returns NULL Other TLS pools registered the same way (md4/md5/hmac_*, strerror, talloc autofree) can opt in as crashes surface; the single flag means the fix is one extra check at the top of each initialiser.
1 parent f3d4c64 commit 0d90b4d

4 files changed

Lines changed: 67 additions & 5 deletions

File tree

src/lib/util/atexit.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ RCSID("$Id$")
3030
#include <freeradius-devel/util/dlist.h>
3131
#include <freeradius-devel/util/atexit.h>
3232

33+
#include <stdatomic.h>
34+
3335
#ifdef HAVE_PTHREADS
3436
#endif
3537

@@ -76,6 +78,24 @@ static fr_atexit_list_t *fr_atexit_global = NULL;
7678
static bool is_exiting;
7779
static _Thread_local bool thread_is_exiting;
7880

81+
/** One-shot flag: once set, thread-local pool allocation stays off for good
 *
 * `fr_atexit_thread_trigger_all()` executes every registered thread
 * destructor on the calling (main) thread, including destructors which
 * free `_Thread_local` pools belonging to threads we don't manage
 * (librdkafka's background threads, perl, etc.).  The backing chunks can
 * be freed here, but another thread's TLS slot cannot be reset, leaving
 * those threads holding dangling cache pointers.  Lazy TLS-pool
 * initialisers (log.c, sbuff.c, strerror.c, ...) therefore test this
 * flag *before* touching their slot, and fall back to
 * `talloc_*(NULL, ...)` (or return NULL) once it's set.
 *
 * Written exactly once and never cleared.  Relaxed ordering suffices:
 * no other state is published through this flag, and readers briefly
 * observing the stale value is harmless.
 */
static atomic_bool thread_local_alloc_disabled;
98+
7999
/** Call the exit handler
80100
*
81101
*/
@@ -403,6 +423,27 @@ int fr_atexit_trigger(bool uctx_scope, fr_atexit_t func, void const *uctx)
403423
}
404424

405425

426+
/** Disable lazy allocation of thread-local caches for the rest of the process
427+
*
428+
* Call this from main *before* `fr_atexit_thread_trigger_all()`. After it
429+
* returns, every TLS-pool initialiser that consults
430+
* `fr_atexit_thread_local_alloc_disabled()` falls back to the un-cached path,
431+
* so `_Thread_local` slots in other threads aren't read after the trigger
432+
* frees their backing memory.
433+
*/
434+
void fr_atexit_thread_local_disable_alloc(void)
435+
{
436+
atomic_store_explicit(&thread_local_alloc_disabled, true, memory_order_relaxed);
437+
}
438+
439+
/** Has @ref fr_atexit_thread_local_disable_alloc been called yet
440+
*
441+
*/
442+
bool fr_atexit_thread_local_alloc_disabled(void)
443+
{
444+
return atomic_load_explicit(&thread_local_alloc_disabled, memory_order_relaxed);
445+
}
446+
406447
/** Return whether we're currently in the teardown phase
407448
*
408449
* When this function returns true no more thread local or global
@@ -602,6 +643,16 @@ int fr_atexit_thread_trigger_all(void)
602643
fr_atexit_list_t *list;
603644
unsigned int count = 0;
604645

646+
/*
647+
* Disable TLS-cached pool handouts before we start freeing
648+
* any of them. Threads we don't manage (librdkafka's bg
649+
* threads, etc.) keep dangling pointers in their
650+
* `_Thread_local` slots once the chunks here are reaped, so
651+
* any TLS-pool initialiser that consults the flag falls back
652+
* to `talloc_*(NULL, ...)` from now on.
653+
*/
654+
fr_atexit_thread_local_disable_alloc();
655+
605656
/*
606657
* Iterate over the list of thread local
607658
* destructor lists running the

src/lib/util/atexit.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ int fr_atexit_trigger(bool uctx_scope, fr_atexit_t func, void const *uctx);
6767

6868
bool fr_atexit_is_exiting(void);
6969

70+
/* Latch off thread-local pool allocation (call before fr_atexit_thread_trigger_all()) */
void fr_atexit_thread_local_disable_alloc(void);

/* Query whether thread-local pool allocation has been latched off */
bool fr_atexit_thread_local_alloc_disabled(void);
73+
7074
#ifdef HAVE_PTHREADS
7175
/*
7276
* Because GCC only added support in 2013 *sigh*

src/lib/util/log.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -347,11 +347,11 @@ TALLOC_CTX *fr_log_pool_init(void)
347347
TALLOC_CTX *pool;
348348

349349
/*
350-
* Post-trigger of every thread atexit handler the TLS slot
351-
* may be a dangling pointer for any thread we don't own
352-
* (librdkafka's bg threads etc.) - skip the pool entirely.
350+
* Once main has signalled shutdown the TLS slot may be a
351+
* dangling pointer for any thread we don't own (librdkafka's
352+
* bg threads etc.) - skip the pool entirely.
353353
*/
354-
if (unlikely(atomic_load_explicit(&log_pools_disabled, memory_order_relaxed))) return NULL;
354+
if (unlikely(fr_atexit_thread_local_alloc_disabled())) return NULL;
355355

356356
pool = fr_log_pool;
357357
if (unlikely(!pool)) {

src/lib/util/sbuff.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1542,7 +1542,14 @@ static inline CC_HINT(always_inline) int sbuff_scratch_init(TALLOC_CTX **out)
15421542
{
15431543
TALLOC_CTX *scratch;
15441544

1545-
if (sbuff_scratch_freed) {
1545+
/*
1546+
* Once main has signalled shutdown the TLS slot may be a
1547+
* dangling pointer on threads we don't own; skip the scratch
1548+
* cache and let callers allocate at top level instead. The
1549+
* TLS-local `sbuff_scratch_freed` is left in place for the
1550+
* per-thread teardown path on FR-managed threads.
1551+
*/
1552+
if (sbuff_scratch_freed || fr_atexit_thread_local_alloc_disabled()) {
15461553
*out = NULL;
15471554
return 0;
15481555
}

0 commit comments

Comments
 (0)