Skip to content

Commit c0b9437

Browse files
Fix lost logcontext when using timeout_deferred(...) (#19090)
Fix lost logcontext when using `timeout_deferred(...)` and things actually timeout. Fix #19087 (our HTTP client times out requests using `timeout_deferred(...)` Fix #19066 (`/sync` uses `notifier.wait_for_events()` which uses `timeout_deferred(...)` under the hood) ### When/why did these lost logcontext warnings start happening? ``` synapse.logging.context - 107 - WARNING - sentinel - Expected logging context call_later but found POST-2453 synapse.logging.context - 107 - WARNING - sentinel - Expected logging context call_later was lost ``` In #18828, we switched `timeout_deferred(...)` from using `reactor.callLater(...)` to [`clock.call_later(...)`](https://github.com/element-hq/synapse/blob/3b59ac3b69f6a2f73a504699b30313d8dcfe4709/synapse/util/clock.py#L224-L313) under the hood. This meant it started dealing with logcontexts but our `time_it_out()` callback didn't follow our [Synapse logcontext rules](https://github.com/element-hq/synapse/blob/3b59ac3b69f6a2f73a504699b30313d8dcfe4709/docs/log_contexts.md).
1 parent f0aae62 commit c0b9437

6 files changed

Lines changed: 89 additions & 10 deletions

File tree

changelog.d/19090.bugfix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix lost logcontext warnings from timeouts in sync and requests made by Synapse itself.

synapse/logging/context.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -896,7 +896,7 @@ def run_in_background(
896896
# If the function messes with logcontexts, we can assume it follows the Synapse
897897
# logcontext rules (Rules for functions returning awaitables: "If the awaitable
898898
# is already complete, the function returns with the same logcontext it started
899-
# with."). If it function doesn't touch logcontexts at all, we can also assume
899+
# with."). If the function doesn't touch logcontexts at all, we can also assume
900900
# the logcontext is unchanged.
901901
#
902902
# Either way, the function should have maintained the calling logcontext, so we
@@ -905,11 +905,21 @@ def run_in_background(
905905
# to reset the logcontext to the sentinel logcontext as that would run
906906
# immediately (remember our goal is to maintain the calling logcontext when we
907907
# return).
908-
logcontext_debug_logger.debug(
909-
"run_in_background(%s): deferred already completed and the function should have maintained the logcontext %s",
910-
instance_id,
911-
calling_context,
912-
)
908+
if current_context() != calling_context:
909+
logcontext_error(
910+
"run_in_background(%s): deferred already completed but the function did not maintain the calling logcontext %s (found %s)"
911+
% (
912+
instance_id,
913+
calling_context,
914+
current_context(),
915+
)
916+
)
917+
else:
918+
logcontext_debug_logger.debug(
919+
"run_in_background(%s): deferred already completed (maintained the calling logcontext %s)",
920+
instance_id,
921+
calling_context,
922+
)
913923
return d
914924

915925
# Since the function we called may follow the Synapse logcontext rules (Rules for

synapse/util/async_helpers.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -808,7 +808,8 @@ def time_it_out() -> None:
808808
timed_out[0] = True
809809

810810
try:
811-
deferred.cancel()
811+
with PreserveLoggingContext():
812+
deferred.cancel()
812813
except Exception: # if we throw any exception it'll break time outs
813814
logger.exception("Canceller failed during timeout")
814815

synapse/util/clock.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ def wrapped_callback(*args: Any, **kwargs: Any) -> None:
266266
# We use `PreserveLoggingContext` to prevent our new `call_later`
267267
# logcontext from finishing as soon as we exit this function, in case `f`
268268
# returns an awaitable/deferred which would continue running and may try to
269-
# restore the `loop_call` context when it's done (because it's trying to
269+
# restore the `call_later` context when it's done (because it's trying to
270270
# adhere to the Synapse logcontext rules.)
271271
#
272272
# This also ensures that we return to the `sentinel` context when we exit

tests/unittest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,9 @@ def logcontext_clean(target: TV) -> TV:
341341
"""
342342

343343
def logcontext_error(msg: str) -> NoReturn:
344+
# Log so we can still see it in the logs like normal
345+
logger.warning(msg)
346+
# But also fail the test
344347
raise AssertionError("logcontext error: %s" % (msg))
345348

346349
patcher = patch("synapse.logging.context.logcontext_error", new=logcontext_error)

tests/util/test_async_helpers.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
)
4646

4747
from tests.server import get_clock
48-
from tests.unittest import TestCase
48+
from tests.unittest import TestCase, logcontext_clean
4949

5050
logger = logging.getLogger(__name__)
5151

@@ -198,7 +198,12 @@ def canceller(_d: Deferred) -> None:
198198

199199
self.failureResultOf(timing_out_d, defer.TimeoutError)
200200

201-
async def test_logcontext_is_preserved_on_cancellation(self) -> None:
201+
@logcontext_clean
202+
async def test_logcontext_is_preserved_on_timeout_cancellation(self) -> None:
203+
"""
204+
Test that the logcontext is preserved when we timeout and the deferred is
205+
cancelled.
206+
"""
202207
# Sanity check that we start in the sentinel context
203208
self.assertEqual(current_context(), SENTINEL_CONTEXT)
204209

@@ -270,6 +275,65 @@ def mark_was_cancelled(res: Failure) -> None:
270275
# Back to the sentinel context
271276
self.assertEqual(current_context(), SENTINEL_CONTEXT)
272277

278+
@logcontext_clean
279+
async def test_logcontext_is_not_lost_when_awaiting_on_timeout_cancellation(
280+
self,
281+
) -> None:
282+
"""
283+
Test that the logcontext isn't lost when we `await make_deferred_yieldable(...)`
284+
the deferred to complete/timeout and it times out.
285+
"""
286+
287+
# Sanity check that we start in the sentinel context
288+
self.assertEqual(current_context(), SENTINEL_CONTEXT)
289+
290+
# Create a deferred which we will never complete
291+
incomplete_d: Deferred = Deferred()
292+
293+
async def competing_task() -> None:
294+
with LoggingContext(
295+
name="competing", server_name="test_server"
296+
) as context_competing:
297+
timing_out_d = timeout_deferred(
298+
deferred=incomplete_d,
299+
timeout=1.0,
300+
clock=self.clock,
301+
)
302+
self.assertNoResult(timing_out_d)
303+
# We should still be in the logcontext we started in
304+
self.assertIs(current_context(), context_competing)
305+
306+
# Mimic the normal use case to wait for the work to complete or timeout.
307+
#
308+
# In this specific test, we expect the deferred to timeout and raise an
309+
# exception at this point.
310+
await make_deferred_yieldable(timing_out_d)
311+
312+
self.fail(
313+
"We should not make it to this point as the `timing_out_d` should have been cancelled"
314+
)
315+
316+
d = defer.ensureDeferred(competing_task())
317+
318+
# Still in the sentinel context
319+
self.assertEqual(current_context(), SENTINEL_CONTEXT)
320+
321+
# Pump until we trigger the timeout
322+
self.reactor.pump(
323+
# We only need to pump `1.0` (seconds) as we set
324+
# `timeout_deferred(timeout=1.0)` above
325+
(1.0,)
326+
)
327+
328+
# Still in the sentinel context
329+
self.assertEqual(current_context(), SENTINEL_CONTEXT)
330+
331+
# We expect a failure due to the timeout
332+
self.failureResultOf(d, defer.TimeoutError)
333+
334+
# Back to the sentinel context at the end of the day
335+
self.assertEqual(current_context(), SENTINEL_CONTEXT)
336+
273337

274338
class _TestException(Exception): #
275339
pass

0 commit comments

Comments
 (0)