Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/19416.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix memory leak caused by not cleaning up stopped looping calls. Introduced in v1.140.0.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did you stumble upon this?

Investigation from seeing some memory graph?

Copy link
Copy Markdown
Member

@devonh devonh Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know it started with seeing the matrix.org client_readers take a long time to do GC, resulting in them appearing down frequently, https://github.com/matrix-org/internal-config/issues/1677
Then looking into why their memory usage was so high and kept growing.
Also, looking at this issue from the other day: #19392

I'm not sure how he jumped from there to looking at looping_calls.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The basic steps were:

  1. Check the graphs to see if there was any obvious correlation, e.g. large cache size.
  2. Manhole into one of the workers and manually clear out all the caches, this did not help.
  3. Install objgraph in the venv and run objgraph.show_most_common_types(), this came back with a list of cell, function (etc)... and then LoopingCall.
  4. At that point I had a look around trying to find where we keep references to looping calls, I just happened to stumble on Clock._looping_calls. One can also use objgraph to generate graphs of references of objects to see what is holding on to the references, which also would have pointed to Clock._looping_calls.

8 changes: 6 additions & 2 deletions synapse/util/clock.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@


import logging
from functools import wraps
from typing import (
Any,
Callable,
)
from weakref import WeakSet

from typing_extensions import ParamSpec
from zope.interface import implementer
Expand Down Expand Up @@ -86,7 +88,7 @@ def __init__(self, reactor: ISynapseThreadlessReactor, server_name: str) -> None
self._delayed_call_id: int = 0
"""Unique ID used to track delayed calls"""

self._looping_calls: list[LoopingCall] = []
self._looping_calls: WeakSet[LoopingCall] = WeakSet()
"""List of active looping calls"""

self._call_id_to_delayed_call: dict[int, IDelayedCall] = {}
Expand Down Expand Up @@ -193,6 +195,7 @@ def _looping_call_common(
if now:
looping_call_context_string = "looping_call_now"

@wraps(f)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Neat. TIL about @wraps

def wrapped_f(*args: P.args, **kwargs: P.kwargs) -> Deferred:
clock_debug_logger.debug(
"%s(%s): Executing callback", looping_call_context_string, instance_id
Expand Down Expand Up @@ -240,7 +243,7 @@ def wrapped_f(*args: P.args, **kwargs: P.kwargs) -> Deferred:
with context.PreserveLoggingContext():
d = call.start(duration.as_secs(), now=now)
d.addErrback(log_failure, "Looping call died", consumeErrors=False)
self._looping_calls.append(call)
self._looping_calls.add(call)

clock_debug_logger.debug(
"%s(%s): Scheduled looping call every %sms later",
Expand Down Expand Up @@ -302,6 +305,7 @@ def call_later(
if self._is_shutdown:
raise Exception("Cannot start delayed call. Clock has been shutdown")

@wraps(callback)
def wrapped_callback(*args: Any, **kwargs: Any) -> None:
clock_debug_logger.debug("call_later(%s): Executing callback", call_id)

Expand Down
77 changes: 77 additions & 0 deletions tests/util/test_clock.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#
# This file is licensed under the Affero General Public License (AGPL) version 3.
#
# Copyright (C) 2025 Element Creations Ltd
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# See the GNU Affero General Public License for more details:
# <https://www.gnu.org/licenses/agpl-3.0.html>.
#
#

import weakref

from synapse.util.duration import Duration

from tests.unittest import HomeserverTestCase


class ClockTestCase(HomeserverTestCase):
def test_looping_calls_are_gced(self) -> None:
"""Test that looping calls are garbage collected after being stopped.

The `Clock` tracks looping calls so to allow stopping of all looping
calls via the clock.
"""
clock = self.hs.get_clock()

# Create a new looping call, and take a weakref to it.
call = clock.looping_call(lambda: None, Duration(seconds=1))

weak_call = weakref.ref(call)

# Stop the looping call. It should get garbage collected after this.
call.stop()

# Delete our strong reference to the call (otherwise it won't get garbage collected).
del call

# Check that the call has been garbage collected.
self.assertIsNone(weak_call())

def test_looping_calls_stopped_on_clock_shutdown(self) -> None:
"""Test that looping calls are stopped when the clock is shut down."""
clock = self.hs.get_clock()

was_called = False

def on_call() -> None:
nonlocal was_called
was_called = True

# Create a new looping call.
call = clock.looping_call(on_call, Duration(seconds=1))
weak_call = weakref.ref(call)
del call # Remove our strong reference to the call.

# The call should still exist.
self.assertIsNotNone(weak_call())

# Advance the clock to trigger the call.
self.reactor.advance(2)
self.assertTrue(was_called)

# Shut down the clock, which should stop the looping call.
clock.shutdown()

# The call should have been garbage collected.
self.assertIsNone(weak_call())

# Advance the clock again; the call should not be called again.
was_called = False
self.reactor.advance(2)
self.assertFalse(was_called)
Loading