Skip to content

Commit f911c1a

Browse files
committed
atexit: skip TLS-cached pools once shutdown has freed them
`fr_atexit_thread_trigger_all()` runs each registered thread destructor on the calling (main) thread, including ones that free `_Thread_local` caches owned by threads outside our schedule (librdkafka's bg threads, perl, etc.). We can free their pool chunks but can't reset another thread's TLS slot, so the next call from those threads dereferences a dangling pointer and aborts with "Bad talloc magic value" — first seen in `_kafka_log_cb` from the "Terminating instance" debug line emitted inside `rd_kafka_destroy()` during `mod_detach`. Add `fr_atexit_thread_local_disable_alloc()` / `..._alloc_disabled()` in atexit.c as the single source of truth, called once from radiusd *before* the trigger. TLS-pool initialisers consult it before reading their slot and fall back to `talloc_*(NULL, ...)` when set: - log.c:fr_log_pool_init returns NULL - sbuff.c:sbuff_scratch_init returns NULL. Other TLS pools registered the same way (md4/md5/hmac_*, strerror, talloc autofree) can opt in as crashes surface; the single flag means the fix is one extra check at the top of each initialiser.
1 parent 11b8427 commit f911c1a

4 files changed

Lines changed: 72 additions & 2 deletions

File tree

src/lib/util/atexit.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ RCSID("$Id$")
3030
#include <freeradius-devel/util/dlist.h>
3131
#include <freeradius-devel/util/atexit.h>
3232

33+
#include <stdatomic.h>
34+
3335
#ifdef HAVE_PTHREADS
3436
#endif
3537

@@ -76,6 +78,24 @@ static fr_atexit_list_t *fr_atexit_global = NULL;
7678
static bool is_exiting;
7779
static _Thread_local bool thread_is_exiting;
7880

81+
/** Latched once main has decided no more thread-local pools should be handed out
82+
*
83+
* `fr_atexit_thread_trigger_all()` runs every registered thread destructor on
84+
* the calling (main) thread, including ones that free `_Thread_local` pools
85+
* owned by threads we don't manage (librdkafka's bg threads, perl, etc.). We
86+
* can free the underlying chunks but can't reset another thread's TLS slot,
87+
* so those threads keep a dangling pointer in their per-thread cache. Code
88+
* that lazily allocates a TLS-cached pool (log.c, sbuff.c, strerror.c, ...)
89+
* checks this flag *before* reading its TLS slot and falls back to
90+
* `talloc_*(NULL, ...)` (or returns NULL) when set, side-stepping the
91+
* dangling pointer entirely.
92+
*
93+
* Set once, never cleared. Relaxed atomic because this is a one-shot signal
94+
* with no other state synchronised through it - readers tolerate observing
95+
* the old value briefly.
96+
*/
97+
static atomic_bool thread_local_alloc_disabled;
98+
7999
/** Call the exit handler
80100
*
81101
*/
@@ -403,6 +423,27 @@ int fr_atexit_trigger(bool uctx_scope, fr_atexit_t func, void const *uctx)
403423
}
404424

405425

426+
/** Disable lazy allocation of thread-local caches for the rest of the process
427+
*
428+
* Call this from main *before* `fr_atexit_thread_trigger_all()`. After it
429+
* returns, every TLS-pool initialiser that consults
430+
* `fr_atexit_thread_local_alloc_disabled()` falls back to the un-cached path,
431+
* so `_Thread_local` slots in other threads aren't read after the trigger
432+
* frees their backing memory.
433+
*/
434+
void fr_atexit_thread_local_disable_alloc(void)
435+
{
436+
atomic_store_explicit(&thread_local_alloc_disabled, true, memory_order_relaxed);
437+
}
438+
439+
/** Has @ref fr_atexit_thread_local_disable_alloc been called yet
440+
*
441+
*/
442+
bool fr_atexit_thread_local_alloc_disabled(void)
443+
{
444+
return atomic_load_explicit(&thread_local_alloc_disabled, memory_order_relaxed);
445+
}
446+
406447
/** Return whether we're currently in the teardown phase
407448
*
408449
* When this function returns true no more thread local or global
@@ -602,6 +643,16 @@ int fr_atexit_thread_trigger_all(void)
602643
fr_atexit_list_t *list;
603644
unsigned int count = 0;
604645

646+
/*
647+
* Disable TLS-cached pool handouts before we start freeing
648+
* any of them. Threads we don't manage (librdkafka's bg
649+
* threads, etc.) keep dangling pointers in their
650+
* `_Thread_local` slots once the chunks here are reaped, so
651+
* any TLS-pool initialiser that consults the flag falls back
652+
* to `talloc_*(NULL, ...)` from now on.
653+
*/
654+
fr_atexit_thread_local_disable_alloc();
655+
605656
/*
606657
* Iterate over the list of thread local
607658
* destructor lists running the

src/lib/util/atexit.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ int fr_atexit_trigger(bool uctx_scope, fr_atexit_t func, void const *uctx);
6767

6868
bool fr_atexit_is_exiting(void);
6969

70+
void fr_atexit_thread_local_disable_alloc(void);
71+
72+
bool fr_atexit_thread_local_alloc_disabled(void);
73+
7074
#ifdef HAVE_PTHREADS
7175
/*
7276
* Because GCC only added support in 2013 *sigh*

src/lib/util/log.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,12 +308,20 @@ static int _fr_log_pool_free(void *arg)
308308
* Functions must ensure that they allocate a new ctx from the one returned
309309
* here, and that this ctx is freed before the function returns.
310310
*
311-
* @return talloc pool to use for scratch space.
311+
* @return talloc pool to use for scratch space, or NULL if thread-local
312+
* allocation has been disabled - callers must tolerate a NULL return.
312313
*/
313314
TALLOC_CTX *fr_log_pool_init(void)
314315
{
315316
TALLOC_CTX *pool;
316317

318+
/*
319+
* Once main has signalled shutdown the TLS slot may be a
320+
* dangling pointer for any thread we don't own (librdkafka's
321+
* bg threads etc.) - skip the pool entirely.
322+
*/
323+
if (unlikely(fr_atexit_thread_local_alloc_disabled())) return NULL;
324+
317325
pool = fr_log_pool;
318326
if (unlikely(!pool)) {
319327
if (fr_atexit_is_exiting()) return NULL; /* No new pools if we're exiting */

src/lib/util/sbuff.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1542,7 +1542,14 @@ static inline CC_HINT(always_inline) int sbuff_scratch_init(TALLOC_CTX **out)
15421542
{
15431543
TALLOC_CTX *scratch;
15441544

1545-
if (sbuff_scratch_freed) {
1545+
/*
1546+
* Once main has signalled shutdown the TLS slot may be a
1547+
* dangling pointer on threads we don't own; skip the scratch
1548+
* cache and let callers allocate at top level instead. The
1549+
* TLS-local `sbuff_scratch_freed` is left in place for the
1550+
* per-thread teardown path on FR-managed threads.
1551+
*/
1552+
if (sbuff_scratch_freed || fr_atexit_thread_local_alloc_disabled()) {
15461553
*out = NULL;
15471554
return 0;
15481555
}

0 commit comments

Comments
 (0)