Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
0480911
First pass: be able to shutdown homeserver that hasn't `setup`
MadLittleMods Nov 14, 2025
a0e7698
Add test
MadLittleMods Nov 14, 2025
df0a5d1
Add some better debugging info when the test fails
MadLittleMods Nov 14, 2025
d046476
Fix up lints
MadLittleMods Nov 14, 2025
d55f86d
Add changelog
MadLittleMods Nov 14, 2025
fa83d65
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-wa…
MadLittleMods Nov 21, 2025
ea1757e
Avoid partially initialized `Keyring` which can hold references to `hs`
MadLittleMods Nov 22, 2025
fce7ada
Better comment
MadLittleMods Nov 22, 2025
b99246b
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-wa…
MadLittleMods Nov 22, 2025
b3aab27
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-wa…
MadLittleMods Nov 24, 2025
72ca424
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-wa…
MadLittleMods Nov 25, 2025
f51e1fe
Fix Synapse unable to shutdown after failing to bind ports during `st…
MadLittleMods Nov 26, 2025
c69397b
Try and figure out why it works on successful case but not error case
MadLittleMods Nov 26, 2025
f2af1d7
Revert "Try and figure out why it works on successful case but not er…
MadLittleMods Nov 26, 2025
cc26fd1
Iterate on test
MadLittleMods Nov 26, 2025
34b1878
Try to start with real invalid ports
MadLittleMods Nov 26, 2025
270aaba
Add changelog
MadLittleMods Nov 26, 2025
e121d66
Use `site.stopFactory()`
MadLittleMods Nov 26, 2025
e49fed1
Remove debug logs
MadLittleMods Nov 26, 2025
d8f4a00
Explain more why
MadLittleMods Nov 26, 2025
422f36d
Remove test as its not effective
MadLittleMods Nov 26, 2025
5729c13
Better language, only
MadLittleMods Nov 26, 2025
2551c24
Better errors
MadLittleMods Nov 26, 2025
ebf3c6e
Remove unused `MemoryReactor` changes now that we don't have a test t…
MadLittleMods Nov 26, 2025
591d920
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-wa…
MadLittleMods Dec 1, 2025
235b684
Use `ExitStack` for nice clean-up during `__init__`
MadLittleMods Dec 1, 2025
25015f7
`self._key_fetchers.clear()`
MadLittleMods Dec 1, 2025
858c2ba
Remove irrelevant comment
MadLittleMods Dec 1, 2025
fe3a84a
Fix `which` typo
MadLittleMods Dec 1, 2025
df97f11
Explain `ExitStack` a little
MadLittleMods Dec 1, 2025
b430e81
Merge branch 'madlittlemods/shutdown-homeserver-that-was-never-setup'…
MadLittleMods Dec 1, 2025
238c936
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-wa…
MadLittleMods Dec 2, 2025
382bbcd
Merge branch 'madlittlemods/shutdown-homeserver-that-was-never-setup'…
MadLittleMods Dec 2, 2025
d0d2802
Merge branch 'develop' into madlittlemods/shutdown-homeserver-that-fa…
MadLittleMods Dec 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/19187.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `HomeServer.shutdown()` failing if the homeserver hasn't been setup yet.
1 change: 1 addition & 0 deletions changelog.d/19232.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `HomeServer.shutdown()` failing if the homeserver failed to `start`.
6 changes: 6 additions & 0 deletions synapse/api/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,6 +856,12 @@ def to_synapse_error(self) -> SynapseError:
return ProxiedRequestError(self.code, errmsg, errcode, j)


class HomeServerNotSetupException(Exception):
"""
Raised when an operation is attempted on the HomeServer before setup() has been called.
"""


class ShadowBanError(Exception):
"""
Raised when a shadow-banned user attempts to perform an action.
Expand Down
74 changes: 44 additions & 30 deletions synapse/app/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,39 +447,53 @@ def listen_http(
hs=hs,
)

if isinstance(listener_config, TCPListenerConfig):
if listener_config.is_tls():
# refresh_certificate should have been called before this.
assert context_factory is not None
ports = listen_ssl(
listener_config.bind_addresses,
listener_config.port,
site,
context_factory,
reactor=reactor,
try:
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best to review this part with the "Hide whitespace" option when viewing the diff

if isinstance(listener_config, TCPListenerConfig):
if listener_config.is_tls():
# refresh_certificate should have been called before this.
assert context_factory is not None
ports = listen_ssl(
listener_config.bind_addresses,
listener_config.port,
site,
context_factory,
reactor=reactor,
)
logger.info(
"Synapse now listening on TCP port %d (TLS)", listener_config.port
)
else:
ports = listen_tcp(
listener_config.bind_addresses,
listener_config.port,
site,
reactor=reactor,
)
logger.info(
"Synapse now listening on TCP port %d", listener_config.port
)

else:
ports = listen_unix(
listener_config.path, listener_config.mode, site, reactor=reactor
)
# getHost() returns a UNIXAddress which contains an instance variable of 'name'
# encoded as a byte string. Decode as utf-8 so pretty.
logger.info(
"Synapse now listening on TCP port %d (TLS)", listener_config.port
)
else:
ports = listen_tcp(
listener_config.bind_addresses,
listener_config.port,
site,
reactor=reactor,
"Synapse now listening on Unix Socket at: %s",
ports[0].getHost().name.decode("utf-8"),
)
logger.info("Synapse now listening on TCP port %d", listener_config.port)

else:
ports = listen_unix(
listener_config.path, listener_config.mode, site, reactor=reactor
)
# getHost() returns a UNIXAddress which contains an instance variable of 'name'
# encoded as a byte string. Decode as utf-8 so pretty.
logger.info(
"Synapse now listening on Unix Socket at: %s",
ports[0].getHost().name.decode("utf-8"),
)
except Exception as exc:
# The Twisted interface says that "Users should not call this function
# themselves!" but this appears to be the correct way handle proper cleanup of
# the site when things go wrong. In the normal case, a `Port` is created which
# we can call `Port.stopListening()` on to do the same thing (but no `Port` is
# created when an error occurs).
#
# We use `site.stopFactory()` instead of `site.doStop()` as the latter assumes
# that `site.doStart()` was called (which won't be the case if an error occurs).
site.stopFactory()
raise Exception("asdf failed to listen") from exc

return ports

Expand Down
153 changes: 104 additions & 49 deletions synapse/crypto/keyring.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

import abc
import logging
from typing import TYPE_CHECKING, Callable, Iterable
from typing import TYPE_CHECKING, Callable, Iterable, Sequence

import attr
from signedjson.key import (
Expand Down Expand Up @@ -150,57 +150,77 @@ class Keyring:
"""

def __init__(
self, hs: "HomeServer", key_fetchers: "Iterable[KeyFetcher] | None" = None
self,
hs: "HomeServer",
test_only_key_fetchers: "Sequence[KeyFetcher] | None" = None,
):
self.server_name = hs.hostname
"""
Args:
hs: The HomeServer instance
test_only_key_fetchers: Dependency injection for tests only. If provided,
these key fetchers will be used instead of the default ones.
"""

if key_fetchers is None:
# Always fetch keys from the database.
mutable_key_fetchers: list[KeyFetcher] = [StoreKeyFetcher(hs)]
# Fetch keys from configured trusted key servers, if any exist.
key_servers = hs.config.key.key_servers
if key_servers:
mutable_key_fetchers.append(PerspectivesKeyFetcher(hs))
# Finally, fetch keys from the origin server directly.
mutable_key_fetchers.append(ServerKeyFetcher(hs))

self._key_fetchers: Iterable[KeyFetcher] = tuple(mutable_key_fetchers)
else:
self._key_fetchers = key_fetchers

self._fetch_keys_queue: BatchingQueue[
_FetchKeyRequest, dict[str, dict[str, FetchKeyResult]]
] = BatchingQueue(
name="keyring_server",
hs=hs,
clock=hs.get_clock(),
# The method called to fetch each key
process_batch_callback=self._inner_fetch_key_requests,
)
try:
self.server_name = hs.hostname

self._key_fetchers: Sequence[KeyFetcher] = []
if test_only_key_fetchers is None:
# Always fetch keys from the database.
self._key_fetchers.append(StoreKeyFetcher(hs))
# Fetch keys from configured trusted key servers, if any exist.
key_servers = hs.config.key.key_servers
if key_servers:
self._key_fetchers.append(PerspectivesKeyFetcher(hs))
# Finally, fetch keys from the origin server directly.
self._key_fetchers.append(ServerKeyFetcher(hs))
else:
self._key_fetchers = test_only_key_fetchers

self._fetch_keys_queue: BatchingQueue[
_FetchKeyRequest, dict[str, dict[str, FetchKeyResult]]
] = BatchingQueue(
name="keyring_server",
hs=hs,
clock=hs.get_clock(),
# The method called to fetch each key
process_batch_callback=self._inner_fetch_key_requests,
)

self._is_mine_server_name = hs.is_mine_server_name
self._is_mine_server_name = hs.is_mine_server_name

# build a FetchKeyResult for each of our own keys, to shortcircuit the
# fetcher.
self._local_verify_keys: dict[str, FetchKeyResult] = {}
for key_id, key in hs.config.key.old_signing_keys.items():
self._local_verify_keys[key_id] = FetchKeyResult(
verify_key=key, valid_until_ts=key.expired
)
# build a FetchKeyResult for each of our own keys, to shortcircuit the
# fetcher.
self._local_verify_keys: dict[str, FetchKeyResult] = {}
for key_id, key in hs.config.key.old_signing_keys.items():
self._local_verify_keys[key_id] = FetchKeyResult(
verify_key=key, valid_until_ts=key.expired
)

vk = get_verify_key(hs.signing_key)
self._local_verify_keys[f"{vk.alg}:{vk.version}"] = FetchKeyResult(
verify_key=vk,
valid_until_ts=2**63, # fake future timestamp
)
vk = get_verify_key(hs.signing_key)
self._local_verify_keys[f"{vk.alg}:{vk.version}"] = FetchKeyResult(
verify_key=vk,
valid_until_ts=2**63, # fake future timestamp
)
except Exception:
self.shutdown()
raise

def shutdown(self) -> None:
"""
Prepares the KeyRing for garbage collection by shutting down it's queues.

This needs to be robust enough to be called even if `__init__` failed partway
through.
"""
self._fetch_keys_queue.shutdown()
for key_fetcher in self._key_fetchers:
key_fetcher.shutdown()
_fetch_keys_queue = getattr(self, "_fetch_keys_queue", None)
if _fetch_keys_queue:
_fetch_keys_queue.shutdown()

_key_fetchers = getattr(self, "_key_fetchers", None)
if _key_fetchers:
for key_fetcher in _key_fetchers:
key_fetcher.shutdown()

async def verify_json_for_server(
self,
Expand Down Expand Up @@ -495,8 +515,13 @@ def __init__(self, hs: "HomeServer"):
def shutdown(self) -> None:
"""
Prepares the KeyFetcher for garbage collection by shutting down it's queue.

This needs to be robust enough to be called even if `__init__` failed partway
through.
"""
self._queue.shutdown()
_queue = getattr(self, "_queue", None)
if _queue:
_queue.shutdown()

async def get_keys(
self, server_name: str, key_ids: list[str], minimum_valid_until_ts: int
Expand All @@ -521,9 +546,24 @@ class StoreKeyFetcher(KeyFetcher):
"""KeyFetcher impl which fetches keys from our data store"""

def __init__(self, hs: "HomeServer"):
super().__init__(hs)

self.store = hs.get_datastores().main
try:
super().__init__(hs)

self.store = hs.get_datastores().main

# `KeyFetcher` keeps a reference to `hs` which we need to clean up if something
# goes wrong so we can cleanly shutdown the homeserver.
#
# If something goes wrong while initializing the `KeyFetcher`, the caller won't
# be able to call shutdown on it because there won't be a reference to the
# `KeyFetcher`.
#
# An error can occur here if someone tries to create a `KeyFetcher` before the
# homeserver is fully set up (`HomeServerNotSetupException: HomeServer.setup
# must be called before getting datastores`).
except Exception:
self.shutdown()
raise

async def _fetch_keys(
self, keys_to_fetch: list[_FetchKeyRequest]
Expand All @@ -543,9 +583,24 @@ async def _fetch_keys(

class BaseV2KeyFetcher(KeyFetcher):
def __init__(self, hs: "HomeServer"):
super().__init__(hs)

self.store = hs.get_datastores().main
try:
super().__init__(hs)

self.store = hs.get_datastores().main

# `KeyFetcher` keeps a reference to `hs` which we need to clean up if something
# goes wrong so we can cleanly shutdown the homeserver.
#
# If something goes wrong while initializing the `KeyFetcher`, the caller won't
# be able to call shutdown on it because there won't be a reference to the
# `KeyFetcher`.
#
# An error can occur here if someone tries to create a `KeyFetcher` before the
# homeserver is fully set up (`HomeServerNotSetupException: HomeServer.setup
# must be called before getting datastores`).
except Exception:
self.shutdown()
raise

async def process_v2_response(
self, from_server: str, response_json: JsonDict, time_added_ms: int
Expand Down
7 changes: 7 additions & 0 deletions synapse/http/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,13 @@ def stopFactory(self) -> None:
protocol.transport.loseConnection()
self.connections.clear()

# Replace the resource tree with an empty resource to break circular references
# to the resource tree which holds a bunch of homeserver references. This is
# important if we try to call `hs.shutdown()` after `start` fails. For some
# reason, this doesn't seem to be necessary in the normal case where `start`
# succeeds and we call `hs.shutdown()` later.
self.resource = Resource()
Comment on lines +818 to +823
Copy link
Copy Markdown
Contributor Author

@MadLittleMods MadLittleMods Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've spent too long trying to figure out why this works exactly. Specifically, why the normal case works without this change but the error case requires it.

The internal references to hs should be the same between the normal and error cases. And we have even less external references to SynapseSite since it just gets orphaned in this function context in the error case and drops away as soon as we handle the traceback/exception in the layers above.

In the normal case, Port holds a reference to the site and then when we call Port.stopListening(), it calls site.stopFactory() down the line just like we're doing in the error case now. But in the error case, it doesn't work without this additional change to sever the circular references.

I've looked through the Twisted internals to try to spot the difference but was unsuccessful. Also tried throwing an LLM at the problem but they were also unable to spot anything.

In any case, clearing circular references in these kinds of callbacks are pretty normal. For example, it's even called out in the HTTPChannel.connectionLost docstring. twisted.web.server.Site.stopFactory describes it as:

This can be overridden to perform 'shutdown' tasks such as disconnecting database connections, closing files, etc.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to have this + the comment saying we don't understand why it's necessary only sometimes.

Replacing your inners with empty values on shutdown/destruction is totally fine as a practice to me.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I'm assuming the current comment is sufficient)


def log(self, request: SynapseRequest) -> None: # type: ignore[override]
pass

Expand Down
35 changes: 28 additions & 7 deletions synapse/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from synapse.api.auth.internal import InternalAuth
from synapse.api.auth.mas import MasDelegatedAuth
from synapse.api.auth_blocking import AuthBlocking
from synapse.api.errors import HomeServerNotSetupException
from synapse.api.filtering import Filtering
from synapse.api.ratelimiting import Ratelimiter, RequestRatelimiter
from synapse.app._base import unregister_sighups
Expand Down Expand Up @@ -399,7 +400,7 @@ def run_as_background_process(
"""
if self._is_shutdown:
raise Exception(
f"Cannot start background process. HomeServer has been shutdown {len(self._background_processes)} {len(self.get_clock()._looping_calls)} {len(self.get_clock()._call_id_to_delayed_call)}"
"Cannot start background process. HomeServer has been shutdown"
)

# Ignore linter error as this is the one location this should be called.
Expand Down Expand Up @@ -466,7 +467,17 @@ async def shutdown(self) -> None:

# TODO: Cleanup replication pieces

self.get_keyring().shutdown()
keyring: Keyring | None = None
try:
keyring = self.get_keyring()
except HomeServerNotSetupException:
# If the homeserver wasn't fully setup, keyring won't have existed before
# this and will fail to be initialized but it cleans itself up for any
# partial initialization problem.
pass

if keyring:
keyring.shutdown()

# Cleanup metrics associated with the homeserver
for later_gauge in all_later_gauges_to_clean_up_on_shutdown.values():
Expand All @@ -478,8 +489,12 @@ async def shutdown(self) -> None:
self.config.server.server_name
)

for db in self.get_datastores().databases:
db.stop_background_updates()
try:
for db in self.get_datastores().databases:
db.stop_background_updates()
except HomeServerNotSetupException:
# If the homeserver wasn't fully setup, the datastores won't exist
pass

if self.should_send_federation():
try:
Expand Down Expand Up @@ -513,8 +528,12 @@ async def shutdown(self) -> None:
pass
self._background_processes.clear()

for db in self.get_datastores().databases:
db._db_pool.close()
try:
for db in self.get_datastores().databases:
db._db_pool.close()
except HomeServerNotSetupException:
# If the homeserver wasn't fully setup, the datastores won't exist
pass

def register_async_shutdown_handler(
self,
Expand Down Expand Up @@ -677,7 +696,9 @@ def get_clock(self) -> Clock:

def get_datastores(self) -> Databases:
if not self.datastores:
raise Exception("HomeServer.setup must be called before getting datastores")
raise HomeServerNotSetupException(
"HomeServer.setup must be called before getting datastores"
)

return self.datastores

Expand Down
Loading
Loading