From b14eed7b80c86a02362132f8a7d382735d96b7ca Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 14:49:17 -0500 Subject: [PATCH 01/23] Add in base linting for metrics From https://github.com/element-hq/synapse/pull/18656 --- scripts-dev/mypy_synapse_plugin.py | 107 ++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 2 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index a15c3c005cf..3657ccadbf4 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -28,8 +28,13 @@ import mypy.types from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode -from mypy.nodes import ARG_NAMED_OPT, TempNode, Var -from mypy.plugin import FunctionSigContext, MethodSigContext, Plugin +from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, Var +from mypy.plugin import ( + FunctionLike, + FunctionSigContext, + MethodSigContext, + Plugin, +) from mypy.typeops import bind_self from mypy.types import ( AnyType, @@ -43,11 +48,30 @@ UnionType, ) +PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode( + "missing-server-name-label", + "`SERVER_NAME_LABEL` required in metric", + category="per-homeserver-tenant-metrics", +) + class SynapsePlugin(Plugin): + def get_function_signature_hook( + self, fullname: str + ) -> Optional[Callable[[FunctionSigContext], FunctionLike]]: + if fullname in ( + "prometheus_client.metrics.Gauge", + # TODO: Add other prometheus_client metrics that need checking as we + # refactor, see https://github.com/element-hq/synapse/issues/18592 + ): + return check_prometheus_metric_instantiation + + return None + def get_method_signature_hook( self, fullname: str ) -> Optional[Callable[[MethodSigContext], CallableType]]: + # print(f"m fullname={fullname}") if fullname.startswith( ( "synapse.util.caches.descriptors.CachedFunction.__call__", @@ -65,6 +89,85 @@ def get_method_signature_hook( return None +def 
check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType: + """ + Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label + when instantiated. + + This is important because we support multiple Synapse instances running in the same + process, where all metrics share a single global `REGISTRY`. The `server_name` label + ensures metrics are correctly separated by homeserver. + + There are also some metrics that apply at the process level, such as CPU usage, + Python garbage collection, Twisted reactor tick time which shouldn't have the + `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the + check, e.g. `# type: ignore[missing-server-name-label]`. + """ + # The true signature, this isn't being modified so this is what will be returned. + signature: CallableType = ctx.default_signature + + # Sanity check the arguments are still as expected in this version of + # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)` + # + # `signature.arg_names` should be: ["name", "documentation", "labelnames", ...] + if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames": + ctx.api.fail( + f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got " + f"{signature.arg_names[2]}", + ctx.context, + ) + return signature + + # Ensure mypy is passing the correct number of arguments because we are doing some + # dirty indexing into `ctx.args` later on. + assert len(ctx.args) == len(signature.arg_names), ( + f"Expected the list of arguments in the {signature.name} signature ({len(signature.arg_names)})" + f" to match the number of arguments from the function signature context ({len(ctx.args)})" + ) + + # Check if the `labelnames` argument includes `SERVER_NAME_LABEL` + # + # `ctx.args` should look like this: + # ``` + # [ + # [StrExpr("name")], + # [StrExpr("documentation")], + # [ListExpr([StrExpr("label1"), StrExpr("label2")])] + # ... 
+ # ] + # ``` + labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None + if isinstance(labelnames_arg_expression, ListExpr): + # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`). + for labelname_expression in labelnames_arg_expression.items: + if ( + isinstance(labelname_expression, NameExpr) + and labelname_expression.fullname == "synapse.metrics.SERVER_NAME_LABEL" + ): + # Found the `SERVER_NAME_LABEL`, all good! + break + else: + ctx.api.fail( + f"Expected {signature.name} to include `SERVER_NAME_LABEL` in the list of labels. " + "If this is a process-level metric (vs homeserver-level), use a type ignore comment " + "to disable this check.", + ctx.context, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, + ) + else: + ctx.api.fail( + f"Expected the `labelnames` argument of {signature.name} to be a list of label names " + f"(including `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. " + "If this is a process-level metric (vs homeserver-level), use a type ignore comment " + "to disable this check.", + ctx.context, + code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL, + ) + return signature + + return signature + + def _get_true_return_type(signature: CallableType) -> mypy.types.Type: """ Get the "final" return type of a callable which might return an Awaitable/Deferred. 
From 4bb84a604cd2577bee3a41672da8d57be43c27eb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:14:35 -0500 Subject: [PATCH 02/23] Fill in `synapse/app/phone_stats_home.py` --- synapse/app/phone_stats_home.py | 34 +++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/synapse/app/phone_stats_home.py b/synapse/app/phone_stats_home.py index 57b6db6ea5d..69d3ac78fd4 100644 --- a/synapse/app/phone_stats_home.py +++ b/synapse/app/phone_stats_home.py @@ -28,6 +28,7 @@ from twisted.internet import defer +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import ( run_as_background_process, ) @@ -57,16 +58,25 @@ _stats_process: List[Tuple[int, "resource.struct_rusage"]] = [] # Gauges to expose monthly active user control metrics -current_mau_gauge = Gauge("synapse_admin_mau_current", "Current MAU") +current_mau_gauge = Gauge( + "synapse_admin_mau_current", + "Current MAU", + labelnames=[SERVER_NAME_LABEL], +) current_mau_by_service_gauge = Gauge( "synapse_admin_mau_current_mau_by_service", "Current MAU by service", - ["app_service"], + labelnames=["app_service", SERVER_NAME_LABEL], +) +max_mau_gauge = Gauge( + "synapse_admin_mau_max", + "MAU Limit", + labelnames=[SERVER_NAME_LABEL], ) -max_mau_gauge = Gauge("synapse_admin_mau_max", "MAU Limit") registered_reserved_users_mau_gauge = Gauge( "synapse_admin_mau_registered_reserved_users", "Registered users with reserved threepids", + labelnames=[SERVER_NAME_LABEL], ) @@ -237,13 +247,21 @@ async def _generate_monthly_active_users() -> None: await store.get_monthly_active_count_by_service() ) reserved_users = await store.get_registered_reserved_users() - current_mau_gauge.set(float(current_mau_count)) + current_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set( + float(current_mau_count) + ) for app_service, count in current_mau_count_by_service.items(): - 
current_mau_by_service_gauge.labels(app_service).set(float(count)) - - registered_reserved_users_mau_gauge.set(float(len(reserved_users))) - max_mau_gauge.set(float(hs.config.server.max_mau_value)) + current_mau_by_service_gauge.labels( + app_service=app_service, **{SERVER_NAME_LABEL: server_name} + ).set(float(count)) + + registered_reserved_users_mau_gauge.labels( + **{SERVER_NAME_LABEL: server_name} + ).set(float(len(reserved_users))) + max_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set( + float(hs.config.server.max_mau_value) + ) return run_as_background_process( "generate_monthly_active_users", From 80d5fd51bc8dcc86b75058fd1ef6f72eec33c746 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 15:24:25 -0500 Subject: [PATCH 03/23] Support `labelnames` argument being a Tuple expression ``` synapse/metrics/__init__.py:522: error: Expected the `labelnames` argument of Histogram to be a list of label names (including `SERVER_NAME_LABEL`), but got TupleExpr:528( StrExpr(type) StrExpr(reason) NameExpr(SERVER_NAME_LABEL [synapse.metrics.SERVER_NAME_LABEL])). If this is a process-level metric (vs homeserver-level), use a type ignore comment to disable this check. 
[missing-server-name-label] ``` --- scripts-dev/mypy_synapse_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 3657ccadbf4..fbb46c9f87e 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -28,7 +28,7 @@ import mypy.types from mypy.erasetype import remove_instance_last_known_values from mypy.errorcodes import ErrorCode -from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, Var +from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, TupleExpr, Var from mypy.plugin import ( FunctionLike, FunctionSigContext, @@ -137,7 +137,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy # ] # ``` labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None - if isinstance(labelnames_arg_expression, ListExpr): + if isinstance(labelnames_arg_expression, (ListExpr, TupleExpr)): # Check if the `labelnames` argument includes the `server_name` label (`SERVER_NAME_LABEL`). 
for labelname_expression in labelnames_arg_expression.items: if ( From 1504100992c42f110634cdd627a52d6488b3fff5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:19:54 -0500 Subject: [PATCH 04/23] Fill in `synapse/federation/federation_server.py` --- contrib/grafana/synapse.json | 2 +- synapse/federation/federation_server.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/contrib/grafana/synapse.json b/contrib/grafana/synapse.json index 62b58a199d2..d12bc5583ba 100644 --- a/contrib/grafana/synapse.json +++ b/contrib/grafana/synapse.json @@ -4396,7 +4396,7 @@ "exemplar": false, "expr": "(time() - max without (job, index, host) (avg_over_time(synapse_federation_last_received_pdu_time[10m]))) / 60", "instant": false, - "legendFormat": "{{server_name}} ", + "legendFormat": "{{origin_server_name}} ", "range": true, "refId": "A" } diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index 3e6b8b84936..c845c65acd2 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -82,6 +82,7 @@ tag_args, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.replication.http.federation import ( ReplicationFederationSendEduRestServlet, @@ -120,7 +121,7 @@ last_pdu_ts_metric = Gauge( "synapse_federation_last_received_pdu_time", "The timestamp of the last PDU which was successfully received from the given domain", - labelnames=("server_name",), + labelnames=("origin_server_name", SERVER_NAME_LABEL), ) @@ -545,7 +546,9 @@ async def process_pdu(pdu: EventBase) -> JsonDict: ) if newest_pdu_ts and origin in self._federation_metrics_domains: - last_pdu_ts_metric.labels(server_name=origin).set(newest_pdu_ts / 1000) + last_pdu_ts_metric.labels( + origin_server_name=origin, **{SERVER_NAME_LABEL: self.server_name} + ).set(newest_pdu_ts / 1000) return pdu_results From 
608f72e04cf868c4cbc700fd465d82546805e299 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:20:13 -0500 Subject: [PATCH 05/23] Fill in `synapse/federation/sender/transaction_manager.py` --- contrib/grafana/synapse.json | 2 +- synapse/federation/sender/transaction_manager.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/contrib/grafana/synapse.json b/contrib/grafana/synapse.json index d12bc5583ba..e23afcf2d30 100644 --- a/contrib/grafana/synapse.json +++ b/contrib/grafana/synapse.json @@ -4518,7 +4518,7 @@ "exemplar": false, "expr": "(time() - max without (job, index, host) (avg_over_time(synapse_federation_last_sent_pdu_time[10m]))) / 60", "instant": false, - "legendFormat": "{{server_name}}", + "legendFormat": "{{destination_server_name}}", "range": true, "refId": "A" } diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py index 21e2fed085d..63ed13c6fa6 100644 --- a/synapse/federation/sender/transaction_manager.py +++ b/synapse/federation/sender/transaction_manager.py @@ -34,6 +34,7 @@ tags, whitelisted_homeserver, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict from synapse.util import json_decoder from synapse.util.metrics import measure_func @@ -47,7 +48,7 @@ last_pdu_ts_metric = Gauge( "synapse_federation_last_sent_pdu_time", "The timestamp of the last PDU which was successfully sent to the given domain", - labelnames=("server_name",), + labelnames=("destination_server_name", SERVER_NAME_LABEL), ) @@ -191,6 +192,7 @@ def json_data_cb() -> JsonDict: if pdus and destination in self._federation_metrics_domains: last_pdu = pdus[-1] - last_pdu_ts_metric.labels(server_name=destination).set( - last_pdu.origin_server_ts / 1000 - ) + last_pdu_ts_metric.labels( + destination_server_name=destination, + **{SERVER_NAME_LABEL: self.server_name}, + ).set(last_pdu.origin_server_ts / 1000) From 
0a2877e64470ea9c1d6c51c648de47706874cdfc Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:30:17 -0500 Subject: [PATCH 06/23] Fill in `synapse/metrics/__init__.py` --- synapse/app/_base.py | 8 +++-- synapse/metrics/__init__.py | 59 ++++++++++++++++++++++++++----------- synapse/server.py | 5 +++- synapse/storage/database.py | 15 ++++++++-- tests/server.py | 4 ++- 5 files changed, 68 insertions(+), 23 deletions(-) diff --git a/synapse/app/_base.py b/synapse/app/_base.py index 6b62d37dca9..48989540bb2 100644 --- a/synapse/app/_base.py +++ b/synapse/app/_base.py @@ -525,8 +525,12 @@ async def start(hs: "HomeServer") -> None: ) # Register the threadpools with our metrics. - register_threadpool("default", reactor.getThreadPool()) - register_threadpool("gai_resolver", resolver_threadpool) + register_threadpool( + name="default", server_name=server_name, threadpool=reactor.getThreadPool() + ) + register_threadpool( + name="gai_resolver", server_name=server_name, threadpool=resolver_threadpool + ) # Set up the SIGHUP machinery. if hasattr(signal, "SIGHUP"): diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index de750a5de2d..a7804ea5976 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -488,19 +488,27 @@ def collect(self) -> Iterable[Metric]: # Used to track where various components have processed in the event stream, # e.g. federation sending, appservice sending, etc. 
-event_processing_positions = Gauge("synapse_event_processing_positions", "", ["name"]) +event_processing_positions = Gauge( + "synapse_event_processing_positions", "", labelnames=["name", SERVER_NAME_LABEL] +) # Used to track the current max events stream position -event_persisted_position = Gauge("synapse_event_persisted_position", "") +event_persisted_position = Gauge( + "synapse_event_persisted_position", "", labelnames=[SERVER_NAME_LABEL] +) # Used to track the received_ts of the last event processed by various # components -event_processing_last_ts = Gauge("synapse_event_processing_last_ts", "", ["name"]) +event_processing_last_ts = Gauge( + "synapse_event_processing_last_ts", "", labelnames=["name", SERVER_NAME_LABEL] +) # Used to track the lag processing events. This is the time difference # between the last processed event's received_ts and the time it was # finished being processed. -event_processing_lag = Gauge("synapse_event_processing_lag", "", ["name"]) +event_processing_lag = Gauge( + "synapse_event_processing_lag", "", labelnames=["name", SERVER_NAME_LABEL] +) event_processing_lag_by_event = Histogram( "synapse_event_processing_lag_by_event", @@ -509,7 +517,11 @@ def collect(self) -> Iterable[Metric]: ) # Build info of the running server. -build_info = Gauge( +# +# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. We +# consider this process-level because all Synapse homeservers running in the process +# will use the same Synapse version. 
+build_info = Gauge( # type: ignore[missing-server-name-label] "synapse_build_info", "Build information", ["pythonversion", "version", "osversion"] ) build_info.labels( @@ -531,38 +543,51 @@ def collect(self) -> Iterable[Metric]: threadpool_total_threads = Gauge( "synapse_threadpool_total_threads", "Total number of threads currently in the threadpool", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) threadpool_total_working_threads = Gauge( "synapse_threadpool_working_threads", "Number of threads currently working in the threadpool", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) threadpool_total_min_threads = Gauge( "synapse_threadpool_min_threads", "Minimum number of threads configured in the threadpool", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) threadpool_total_max_threads = Gauge( "synapse_threadpool_max_threads", "Maximum number of threads configured in the threadpool", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) -def register_threadpool(name: str, threadpool: ThreadPool) -> None: - """Add metrics for the threadpool.""" +def register_threadpool(*, name: str, server_name: str, threadpool: ThreadPool) -> None: + """ + Add metrics for the threadpool. - threadpool_total_min_threads.labels(name).set(threadpool.min) - threadpool_total_max_threads.labels(name).set(threadpool.max) + Args: + name: The name of the threadpool, used to identify it in the metrics. + server_name: The homeserver name (used to label metrics) (this should be `hs.hostname`). + threadpool: The threadpool to register metrics for. 
+ """ - threadpool_total_threads.labels(name).set_function(lambda: len(threadpool.threads)) - threadpool_total_working_threads.labels(name).set_function( - lambda: len(threadpool.working) - ) + threadpool_total_min_threads.labels( + name=name, **{SERVER_NAME_LABEL: server_name} + ).set(threadpool.min) + threadpool_total_max_threads.labels( + name=name, **{SERVER_NAME_LABEL: server_name} + ).set(threadpool.max) + + threadpool_total_threads.labels( + name=name, **{SERVER_NAME_LABEL: server_name} + ).set_function(lambda: len(threadpool.threads)) + threadpool_total_working_threads.labels( + name=name, **{SERVER_NAME_LABEL: server_name} + ).set_function(lambda: len(threadpool.working)) class MetricsResource(Resource): diff --git a/synapse/server.py b/synapse/server.py index df4474f3c14..ec569434f82 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -980,7 +980,10 @@ def get_media_sender_thread_pool(self) -> ThreadPool: ) # Register the threadpool with our metrics. - register_threadpool("media", media_threadpool) + server_name = self.hostname + register_threadpool( + name="media", server_name=server_name, threadpool=media_threadpool + ) return media_threadpool diff --git a/synapse/storage/database.py b/synapse/storage/database.py index 7f2cc9625a4..01bbe772352 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -118,9 +118,11 @@ def reconnect(self) -> None: ... 
def make_pool( + *, reactor: IReactorCore, db_config: DatabaseConnectionConfig, engine: BaseDatabaseEngine, + server_name: str, ) -> adbapi.ConnectionPool: """Get the connection pool for the database.""" @@ -144,7 +146,11 @@ def _on_new_connection(conn: Connection) -> None: **db_args, ) - register_threadpool(f"database-{db_config.name}", connection_pool.threadpool) + register_threadpool( + name=f"database-{db_config.name}", + server_name=server_name, + threadpool=connection_pool.threadpool, + ) return connection_pool @@ -565,7 +571,12 @@ def __init__( self._clock = hs.get_clock() self._txn_limit = database_config.config.get("txn_limit", 0) self._database_config = database_config - self._db_pool = make_pool(hs.get_reactor(), database_config, engine) + self._db_pool = make_pool( + reactor=hs.get_reactor(), + db_config=database_config, + engine=engine, + server_name=self.server_name, + ) self.updates = BackgroundUpdater(hs, self) LaterGauge( diff --git a/tests/server.py b/tests/server.py index 0c519bc4c98..4fbc709edee 100644 --- a/tests/server.py +++ b/tests/server.py @@ -710,7 +710,9 @@ def make_fake_db_pool( is a drop-in replacement for the normal `make_pool` which builds such a connection pool. 
""" - pool = make_pool(reactor, db_config, engine) + pool = make_pool( + reactor=reactor, db_config=db_config, engine=engine, server_name="test_server" + ) def runWithConnection( func: Callable[..., R], *args: Any, **kwargs: Any From 0c93d85b047b437458f42577db099452b21072fb Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:31:32 -0500 Subject: [PATCH 07/23] Fill in `synapse/metrics/_gc.py` --- synapse/metrics/_gc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index d16481a0f66..ee86e274790 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -54,7 +54,8 @@ # Python GC metrics # -gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) +# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. +gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) # type: ignore[missing-server-name-label] gc_time = Histogram( "python_gc_time", "Time taken to GC (sec)", From fe1a16f9da56c96e116e4ff3e76afed2a015f302 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:33:12 -0500 Subject: [PATCH 08/23] Fill in `synapse/metrics/common_usage_metrics.py` --- synapse/metrics/common_usage_metrics.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/synapse/metrics/common_usage_metrics.py b/synapse/metrics/common_usage_metrics.py index ea1ffd171de..cd1c3c86499 100644 --- a/synapse/metrics/common_usage_metrics.py +++ b/synapse/metrics/common_usage_metrics.py @@ -22,6 +22,7 @@ import attr +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process if TYPE_CHECKING: @@ -33,6 +34,7 @@ current_dau_gauge = Gauge( "synapse_admin_daily_active_users", "Current daily active users count", + labelnames=[SERVER_NAME_LABEL], ) @@ -89,4 +91,6 @@ async def _update_gauges(self) -> None: """Update the 
Prometheus gauges.""" metrics = await self._collect() - current_dau_gauge.set(float(metrics.daily_active_users)) + current_dau_gauge.labels( + **{SERVER_NAME_LABEL: self.server_name}, + ).set(float(metrics.daily_active_users)) From 944df9ccebe7b58004120c3adb21cbb91e3c2453 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:35:20 -0500 Subject: [PATCH 09/23] Fill in `synapse/push/pusherpool.py` --- synapse/push/pusherpool.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index d6d5de2ec5b..ee51872c2c5 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -25,6 +25,7 @@ from prometheus_client import Gauge from synapse.api.errors import Codes, SynapseError +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, @@ -44,7 +45,9 @@ synapse_pushers = Gauge( - "synapse_pushers", "Number of active synapse pushers", ["kind", "app_id"] + "synapse_pushers", + "Number of active synapse pushers", + labelnames=["kind", "app_id", SERVER_NAME_LABEL], ) @@ -420,11 +423,17 @@ async def _start_pusher(self, pusher_config: PusherConfig) -> Optional[Pusher]: previous_pusher.on_stop() synapse_pushers.labels( - type(previous_pusher).__name__, previous_pusher.app_id + kind=type(previous_pusher).__name__, + app_id=previous_pusher.app_id, + **{SERVER_NAME_LABEL: self.server_name}, ).dec() byuser[appid_pushkey] = pusher - synapse_pushers.labels(type(pusher).__name__, pusher.app_id).inc() + synapse_pushers.labels( + kind=type(pusher).__name__, + app_id=pusher.app_id, + **{SERVER_NAME_LABEL: self.server_name}, + ).inc() logger.info("Starting pusher %s / %s", pusher.user_id, appid_pushkey) @@ -476,4 +485,8 @@ def maybe_stop_pusher(self, app_id: str, pushkey: str, user_id: str) -> None: pusher = byuser.pop(appid_pushkey) pusher.on_stop() - 
synapse_pushers.labels(type(pusher).__name__, pusher.app_id).dec() + synapse_pushers.labels( + kind=type(pusher).__name__, + app_id=pusher.app_id, + **{SERVER_NAME_LABEL: self.server_name}, + ).dec() From 54e2374dbe7b38e7ef762a71512c55b47f178cee Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:37:23 -0500 Subject: [PATCH 10/23] Fill in `synapse/replication/http/_base.py` --- synapse/replication/http/_base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 31204a83849..9db3439002b 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -38,6 +38,7 @@ from synapse.http.site import SynapseRequest from synapse.logging import opentracing from synapse.logging.opentracing import trace_with_opname +from synapse.metrics import SERVER_NAME_LABEL from synapse.types import JsonDict from synapse.util.caches.response_cache import ResponseCache from synapse.util.cancellation import is_function_cancellable @@ -51,7 +52,7 @@ _pending_outgoing_requests = Gauge( "synapse_pending_outgoing_replication_requests", "Number of active outgoing replication requests, by replication method name", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) _outgoing_request_counter = Counter( @@ -205,13 +206,17 @@ def make_client(cls, hs: "HomeServer") -> Callable: parameter to specify which instance to hit (the instance must be in the `instance_map` config). 
""" + server_name = hs.hostname clock = hs.get_clock() client = hs.get_replication_client() local_instance_name = hs.get_instance_name() instance_map = hs.config.worker.instance_map - outgoing_gauge = _pending_outgoing_requests.labels(cls.NAME) + outgoing_gauge = _pending_outgoing_requests.labels( + name=cls.NAME, + **{SERVER_NAME_LABEL: server_name}, + ) replication_secret = None if hs.config.worker.worker_replication_secret: From 883062b8ffbe87bc2cb00785448ab0a82e818f30 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:38:56 -0500 Subject: [PATCH 11/23] Fill in `synapse/storage/databases/main/event_federation.py` --- synapse/storage/databases/main/event_federation.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index 8e623bf0617..5c0ae578814 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -45,6 +45,7 @@ from synapse.api.room_versions import EventFormatVersions, RoomVersion from synapse.events import EventBase, make_event_from_dict from synapse.logging.opentracing import tag_args, trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.background_updates import ForeignKeyConstraint @@ -70,11 +71,13 @@ oldest_pdu_in_federation_staging = Gauge( "synapse_federation_server_oldest_inbound_pdu_in_staging", "The age in seconds since we received the oldest pdu in the federation staging area", + labelnames=[SERVER_NAME_LABEL], ) number_pdus_in_federation_queue = Gauge( "synapse_federation_server_number_inbound_pdu_in_staging", "The total number of events in the inbound federation staging", + labelnames=[SERVER_NAME_LABEL], ) pdus_pruned_from_federation_queue = Counter( @@ 
-2056,8 +2059,12 @@ def _get_stats_for_federation_staging_txn( "_get_stats_for_federation_staging", _get_stats_for_federation_staging_txn ) - number_pdus_in_federation_queue.set(count) - oldest_pdu_in_federation_staging.set(age) + number_pdus_in_federation_queue.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).set(count) + oldest_pdu_in_federation_staging.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).set(age) async def clean_room_for_join(self, room_id: str) -> None: await self.db_pool.runInteraction( From 90efa417a695c9843aea66dd17359867ef675e9b Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:40:24 -0500 Subject: [PATCH 12/23] Fill in `synapse/storage/databases/main/events_worker.py` --- synapse/storage/databases/main/events_worker.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index 56209716514..349674f28de 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -68,6 +68,7 @@ tag_args, trace, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, @@ -138,6 +139,7 @@ def __init__( event_fetch_ongoing_gauge = Gauge( "synapse_event_fetch_ongoing", "The number of event fetchers that are running", + labelnames=[SERVER_NAME_LABEL], ) @@ -312,7 +314,9 @@ def __init__( Tuple[Iterable[str], "defer.Deferred[Dict[str, _EventRow]]"] ] = [] self._event_fetch_ongoing = 0 - event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) + event_fetch_ongoing_gauge.labels(**{SERVER_NAME_LABEL: self.server_name}).set( + self._event_fetch_ongoing + ) # We define this sequence here so that it can be referenced from both # the DataStore and PersistEventStore. 
@@ -1134,7 +1138,9 @@ def _maybe_start_fetch_thread(self) -> None: and self._event_fetch_ongoing < EVENT_QUEUE_THREADS ): self._event_fetch_ongoing += 1 - event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) + event_fetch_ongoing_gauge.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).set(self._event_fetch_ongoing) # `_event_fetch_ongoing` is decremented in `_fetch_thread`. should_start = True else: @@ -1158,7 +1164,9 @@ async def _fetch_thread(self) -> None: event_fetches_to_fail = [] with self._event_fetch_lock: self._event_fetch_ongoing -= 1 - event_fetch_ongoing_gauge.set(self._event_fetch_ongoing) + event_fetch_ongoing_gauge.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).set(self._event_fetch_ongoing) # There may still be work remaining in `_event_fetch_list` if we # failed, or it was added in between us deciding to exit and From f4b6d35e7c3eecaf9de455677ff336b1a5d3ecfd Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:44:17 -0500 Subject: [PATCH 13/23] Fill in `synapse/util/batching_queue.py` --- synapse/util/batching_queue.py | 21 +++++++++++++-------- tests/util/test_batching_queue.py | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 3a2d664d28d..4c0f129423b 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -37,6 +37,7 @@ from twisted.internet import defer from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.util import Clock @@ -49,19 +50,19 @@ number_queued = Gauge( "synapse_util_batching_queue_number_queued", "The number of items waiting in the queue across all keys", - labelnames=("name",), + labelnames=("name", SERVER_NAME_LABEL), ) number_in_flight = Gauge( "synapse_util_batching_queue_number_pending", "The number of 
items across all keys either being processed or waiting in a queue", - labelnames=("name",), + labelnames=("name", SERVER_NAME_LABEL), ) number_of_keys = Gauge( "synapse_util_batching_queue_number_of_keys", "The number of distinct keys that have items queued", - labelnames=("name",), + labelnames=("name", SERVER_NAME_LABEL), ) @@ -114,13 +115,17 @@ def __init__( # The function to call with batches of values. self._process_batch_callback = process_batch_callback - number_queued.labels(self._name).set_function( - lambda: sum(len(q) for q in self._next_values.values()) - ) + number_queued.labels( + name=self._name, **{SERVER_NAME_LABEL: self.server_name} + ).set_function(lambda: sum(len(q) for q in self._next_values.values())) - number_of_keys.labels(self._name).set_function(lambda: len(self._next_values)) + number_of_keys.labels( + name=self._name, **{SERVER_NAME_LABEL: self.server_name} + ).set_function(lambda: len(self._next_values)) - self._number_in_flight_metric: Gauge = number_in_flight.labels(self._name) + self._number_in_flight_metric: Gauge = number_in_flight.labels( + name=self._name, **{SERVER_NAME_LABEL: self.server_name} + ) async def add_to_queue(self, value: V, key: Hashable = ()) -> R: """Adds the value to the queue with the given key, returning the result diff --git a/tests/util/test_batching_queue.py b/tests/util/test_batching_queue.py index 49ddc117255..532582cf877 100644 --- a/tests/util/test_batching_queue.py +++ b/tests/util/test_batching_queue.py @@ -42,9 +42,9 @@ def setUp(self) -> None: # We ensure that we remove any existing metrics for "test_queue". 
try: - number_queued.remove("test_queue") - number_of_keys.remove("test_queue") - number_in_flight.remove("test_queue") + number_queued.remove("test_queue", "test_server") + number_of_keys.remove("test_queue", "test_server") + number_in_flight.remove("test_queue", "test_server") except KeyError: pass From 656c3ad8eec6cc6d57809270380cf6aaa18f3065 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:46:03 -0500 Subject: [PATCH 14/23] Fill in `synapse/util/caches/deferred_cache.py` --- synapse/util/caches/deferred_cache.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/synapse/util/caches/deferred_cache.py b/synapse/util/caches/deferred_cache.py index 0c6c912918a..92d446ce2aa 100644 --- a/synapse/util/caches/deferred_cache.py +++ b/synapse/util/caches/deferred_cache.py @@ -43,6 +43,7 @@ from twisted.internet import defer from twisted.python.failure import Failure +from synapse.metrics import SERVER_NAME_LABEL from synapse.util.async_helpers import ObservableDeferred from synapse.util.caches.lrucache import LruCache from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry @@ -50,7 +51,7 @@ cache_pending_metric = Gauge( "synapse_util_caches_cache_pending", "Number of lookups currently pending for this cache", - ["name"], + labelnames=["name", SERVER_NAME_LABEL], ) T = TypeVar("T") @@ -111,7 +112,9 @@ def __init__( ] = cache_type() def metrics_cb() -> None: - cache_pending_metric.labels(name).set(len(self._pending_deferred_cache)) + cache_pending_metric.labels( + name=name, **{SERVER_NAME_LABEL: server_name} + ).set(len(self._pending_deferred_cache)) # cache is used for completed results and maps to the result itself, rather than # a Deferred. 
From c55e61522535807cb79965642d1edb60f506d08e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:49:04 -0500 Subject: [PATCH 15/23] Add changelog --- changelog.d/18725.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/18725.misc diff --git a/changelog.d/18725.misc b/changelog.d/18725.misc new file mode 100644 index 00000000000..7fa5b47b891 --- /dev/null +++ b/changelog.d/18725.misc @@ -0,0 +1 @@ +Refactor `Gauge` metrics to be homeserver-scoped. From 0fd34a67ee07794b776d9fd369a4a38ee1228e48 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 17:55:29 -0500 Subject: [PATCH 16/23] Add upgrade notes --- docs/upgrade.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/upgrade.md b/docs/upgrade.md index e79ca93c040..9decc43727b 100644 --- a/docs/upgrade.md +++ b/docs/upgrade.md @@ -117,6 +117,25 @@ each upgrade are complete before moving on to the next upgrade, to avoid stacking them up. You can monitor the currently running background updates with [the Admin API](usage/administration/admin_api/background_updates.html#status). +# Upgrading to v1.136.0 + +## Metric labels have changed on `synapse_federation_last_received_pdu_time` and `synapse_federation_last_sent_pdu_time` + +Previously, the `synapse_federation_last_received_pdu_time` and +`synapse_federation_last_sent_pdu_time` metrics both used the `server_name` label to +differentiate between different servers that we send and receive events from. 
+ +Since we're now using the `server_name` label to differentiate between different Synapse +homeserver instances running in the same process, these metrics have been changed as follows: + + - `synapse_federation_last_received_pdu_time` now uses the `origin_server_name` label + - `synapse_federation_last_sent_pdu_time` now uses the `destination_server_name` label + +The Grafana dashboard JSON in `contrib/grafana/synapse.json` has been updated to reflect +this change but you will need to manually update your own existing Grafana dashboards +using these metrics. + + # Upgrading to v1.135.0 ## `on_user_registration` module API callback may now run on any worker From 563f543074fc868049f02274de4a866f9289122c Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 18:26:30 -0500 Subject: [PATCH 17/23] Fix `make_fake_db_pool` --- tests/server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/server.py b/tests/server.py index 4fbc709edee..bb9533126d1 100644 --- a/tests/server.py +++ b/tests/server.py @@ -702,6 +702,7 @@ def make_fake_db_pool( reactor: ISynapseReactor, db_config: DatabaseConnectionConfig, engine: BaseDatabaseEngine, + server_name: str, ) -> adbapi.ConnectionPool: """Wrapper for `make_pool` which builds a pool which runs db queries synchronously. @@ -711,7 +712,7 @@ def make_fake_db_pool( pool. 
""" pool = make_pool( - reactor=reactor, db_config=db_config, engine=engine, server_name="test_server" + reactor=reactor, db_config=db_config, engine=engine, server_name=server_name ) def runWithConnection( From 554d588d9b69ab4512bdf5d185a6ace1f2008876 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 18:38:33 -0500 Subject: [PATCH 18/23] Fix `event_persisted_position` usage --- synapse/storage/databases/main/events.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index c0299cb62e8..438745b0283 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -62,6 +62,7 @@ LoggingTransaction, make_tuple_in_list_sql_clause, ) +from synapse.metrics import SERVER_NAME_LABEL from synapse.storage.databases.main.event_federation import EventFederationStore from synapse.storage.databases.main.events_worker import EventCacheEntry from synapse.storage.databases.main.search import SearchEntry @@ -237,6 +238,7 @@ def __init__( db_conn: LoggingDatabaseConnection, ): self.hs = hs + self.server_name = hs.hostname self.db_pool = db self.store = main_data_store self.database_engine = db.engine @@ -362,7 +364,9 @@ async def _persist_events_and_state_updates( if not use_negative_stream_ordering: # we don't want to set the event_persisted_position to a negative # stream_ordering. 
- synapse.metrics.event_persisted_position.set(stream) + synapse.metrics.event_persisted_position.labels( + **{SERVER_NAME_LABEL: self.server_name} + ).set(stream) for event, context in events_and_contexts: if context.app_service: From 2536aaffad770cb61ee50e307fb015f664d3afe8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 18:57:27 -0500 Subject: [PATCH 19/23] Fix lints --- synapse/storage/databases/main/events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 438745b0283..f0eb671b726 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -55,6 +55,7 @@ from synapse.events.snapshot import EventContext from synapse.events.utils import parse_stripped_state_event from synapse.logging.opentracing import trace +from synapse.metrics import SERVER_NAME_LABEL from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.database import ( DatabasePool, @@ -62,7 +63,6 @@ LoggingTransaction, make_tuple_in_list_sql_clause, ) -from synapse.metrics import SERVER_NAME_LABEL from synapse.storage.databases.main.event_federation import EventFederationStore from synapse.storage.databases.main.events_worker import EventCacheEntry from synapse.storage.databases.main.search import SearchEntry From 7b55ffb977dd26528967628e638b539d3715121c Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Wed, 23 Jul 2025 19:15:39 -0500 Subject: [PATCH 20/23] Fill in event `Gauge` metrics from `synapse/metrics/__init__.py` --- synapse/federation/sender/__init__.py | 9 ++++++--- synapse/handlers/appservice.py | 10 +++++++--- synapse/handlers/delayed_events.py | 6 ++++-- synapse/handlers/presence.py | 8 ++++---- synapse/handlers/room_member.py | 6 ++++-- synapse/handlers/stats.py | 6 ++++-- synapse/handlers/user_directory.py | 7 ++++--- 7 files changed, 33 insertions(+), 19 deletions(-) diff --git 
a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index e223b2da1c1..98ead69183a 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -160,6 +160,7 @@ from synapse.federation.units import Edu from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import ( + SERVER_NAME_LABEL, LaterGauge, event_processing_loop_counter, event_processing_loop_room_count, @@ -702,10 +703,12 @@ async def handle_room_events(events: List[EventBase]) -> None: assert ts is not None synapse.metrics.event_processing_lag.labels( - "federation_sender" + name="federation_sender", + **{SERVER_NAME_LABEL: self.server_name}, ).set(now - ts) synapse.metrics.event_processing_last_ts.labels( - "federation_sender" + name="federation_sender", + **{SERVER_NAME_LABEL: self.server_name}, ).set(ts) events_processed_counter.inc(len(event_entries)) @@ -717,7 +720,7 @@ async def handle_room_events(events: List[EventBase]) -> None: event_processing_loop_counter.labels("federation_sender").inc() synapse.metrics.event_processing_positions.labels( - "federation_sender" + name="federation_sender", **{SERVER_NAME_LABEL: self.server_name} ).set(next_token) finally: diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index dca500f37b2..bc9fda567ac 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -42,6 +42,7 @@ from synapse.handlers.presence import format_user_presence_state from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import ( + SERVER_NAME_LABEL, event_processing_loop_counter, event_processing_loop_room_count, ) @@ -204,7 +205,8 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: await self.store.set_appservice_last_pos(upper_bound) synapse.metrics.event_processing_positions.labels( - "appservice_sender" + name="appservice_sender", + **{SERVER_NAME_LABEL: 
self.server_name}, ).set(upper_bound) events_processed_counter.inc(len(events)) @@ -221,10 +223,12 @@ async def handle_room_events(events: Iterable[EventBase]) -> None: assert ts is not None synapse.metrics.event_processing_lag.labels( - "appservice_sender" + name="appservice_sender", + **{SERVER_NAME_LABEL: self.server_name}, ).set(now - ts) synapse.metrics.event_processing_last_ts.labels( - "appservice_sender" + name="appservice_sender", + **{SERVER_NAME_LABEL: self.server_name}, ).set(ts) finally: self.is_processing = False diff --git a/synapse/handlers/delayed_events.py b/synapse/handlers/delayed_events.py index a093505ca02..ce13dcc737c 100644 --- a/synapse/handlers/delayed_events.py +++ b/synapse/handlers/delayed_events.py @@ -22,7 +22,7 @@ from synapse.api.ratelimiting import Ratelimiter from synapse.config.workers import MAIN_PROCESS_INSTANCE_NAME from synapse.logging.opentracing import set_tag -from synapse.metrics import event_processing_positions +from synapse.metrics import SERVER_NAME_LABEL, event_processing_positions from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.http.delayed_events import ( ReplicationAddedDelayedEventRestServlet, @@ -191,7 +191,9 @@ async def _unsafe_process_new_event(self) -> None: self._event_pos = max_pos # Expose current event processing position to prometheus - event_processing_positions.labels("delayed_events").set(max_pos) + event_processing_positions.labels( + name="delayed_events", **{SERVER_NAME_LABEL: self.server_name} + ).set(max_pos) await self._store.update_delayed_events_stream_pos(max_pos) diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index db070c60efb..012b6c0d62c 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -105,7 +105,7 @@ from synapse.appservice import ApplicationService from synapse.events.presence_router import PresenceRouter from synapse.logging.context import run_in_background -from 
synapse.metrics import LaterGauge +from synapse.metrics import SERVER_NAME_LABEL, LaterGauge from synapse.metrics.background_process_metrics import ( run_as_background_process, wrap_as_background_process, @@ -1539,9 +1539,9 @@ async def _unsafe_process(self) -> None: self._event_pos = max_pos # Expose current event processing position to prometheus - synapse.metrics.event_processing_positions.labels("presence").set( - max_pos - ) + synapse.metrics.event_processing_positions.labels( + name="presence", **{SERVER_NAME_LABEL: self.server_name} + ).set(max_pos) async def _handle_state_delta(self, room_id: str, deltas: List[StateDelta]) -> None: """Process current state deltas for the room to find new joins that need diff --git a/synapse/handlers/room_member.py b/synapse/handlers/room_member.py index 897731e4dfd..379c8430576 100644 --- a/synapse/handlers/room_member.py +++ b/synapse/handlers/room_member.py @@ -49,7 +49,7 @@ from synapse.handlers.state_deltas import MatchChange, StateDeltasHandler from synapse.handlers.worker_lock import NEW_EVENT_DURING_PURGE_LOCK_NAME from synapse.logging import opentracing -from synapse.metrics import event_processing_positions +from synapse.metrics import SERVER_NAME_LABEL, event_processing_positions from synapse.metrics.background_process_metrics import run_as_background_process from synapse.replication.http.push import ReplicationCopyPusherRestServlet from synapse.storage.databases.main.state_deltas import StateDelta @@ -2255,7 +2255,9 @@ async def _unsafe_process(self) -> None: self.pos = max_pos # Expose current event processing position to prometheus - event_processing_positions.labels("room_forgetter").set(max_pos) + event_processing_positions.labels( + name="room_forgetter", **{SERVER_NAME_LABEL: self.server_name} + ).set(max_pos) await self._store.update_room_forgetter_stream_pos(max_pos) diff --git a/synapse/handlers/stats.py b/synapse/handlers/stats.py index 5cd4ec2b82f..a2602ea818e 100644 --- a/synapse/handlers/stats.py +++ 
b/synapse/handlers/stats.py @@ -32,7 +32,7 @@ ) from synapse.api.constants import EventContentFields, EventTypes, Membership -from synapse.metrics import event_processing_positions +from synapse.metrics import SERVER_NAME_LABEL, event_processing_positions from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.databases.main.state_deltas import StateDelta from synapse.types import JsonDict @@ -147,7 +147,9 @@ async def _unsafe_process(self) -> None: logger.debug("Handled room stats to %s -> %s", self.pos, max_pos) - event_processing_positions.labels("stats").set(max_pos) + event_processing_positions.labels( + name="stats", **{SERVER_NAME_LABEL: self.server_name} + ).set(max_pos) self.pos = max_pos diff --git a/synapse/handlers/user_directory.py b/synapse/handlers/user_directory.py index 17a3a87d865..130099a2390 100644 --- a/synapse/handlers/user_directory.py +++ b/synapse/handlers/user_directory.py @@ -35,6 +35,7 @@ ) from synapse.api.errors import Codes, SynapseError from synapse.handlers.state_deltas import MatchChange, StateDeltasHandler +from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.databases.main.state_deltas import StateDelta from synapse.storage.databases.main.user_directory import SearchResult @@ -262,9 +263,9 @@ async def _unsafe_process(self) -> None: self.pos = max_pos # Expose current event processing position to prometheus - synapse.metrics.event_processing_positions.labels("user_dir").set( - max_pos - ) + synapse.metrics.event_processing_positions.labels( + name="user_dir", **{SERVER_NAME_LABEL: self.server_name} + ).set(max_pos) await self.store.update_user_directory_stream_pos(max_pos) From d587aa9e86edbe91cdf3028b2a3957d1cb32b6cc Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Thu, 24 Jul 2025 15:35:57 -0500 Subject: [PATCH 21/23] Remove debug log --- scripts-dev/mypy_synapse_plugin.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index fbb46c9f87e..72bc3e3fdd3 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -71,7 +71,6 @@ def get_function_signature_hook( def get_method_signature_hook( self, fullname: str ) -> Optional[Callable[[MethodSigContext], CallableType]]: - # print(f"m fullname={fullname}") if fullname.startswith( ( "synapse.util.caches.descriptors.CachedFunction.__call__", From 650ce32936e150aadff53dd8b08c621d8ed3f9e1 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 25 Jul 2025 15:21:59 -0500 Subject: [PATCH 22/23] Fix grammar See https://github.com/element-hq/synapse/pull/18725#discussion_r2229692212 --- scripts-dev/mypy_synapse_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 7804214391e..24f5eae4b5a 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -99,7 +99,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy ensures metrics are correctly separated by homeserver. There are also some metrics that apply at the process level, such as CPU usage, - Python garbage collection, Twisted reactor tick time which shouldn't have the + Python garbage collection, and Twisted reactor tick time, which shouldn't have the `SERVER_NAME_LABEL`. In those cases, use use a type ignore comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`. 
""" From 4387262193f91af41e52c96058c8c76b840ea48a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 25 Jul 2025 15:22:31 -0500 Subject: [PATCH 23/23] Fix duplicate word typo See https://github.com/element-hq/synapse/pull/18725#discussion_r2229706490 --- scripts-dev/mypy_synapse_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 24f5eae4b5a..3eab2b3b73b 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -100,7 +100,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy There are also some metrics that apply at the process level, such as CPU usage, Python garbage collection, and Twisted reactor tick time, which shouldn't have the - `SERVER_NAME_LABEL`. In those cases, use use a type ignore comment to disable the + `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`. """ # The true signature, this isn't being modified so this is what will be returned.