Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b14eed7
Add in base linting for metrics
MadLittleMods Jul 23, 2025
4bb84a6
Fill in `synapse/app/phone_stats_home.py`
MadLittleMods Jul 23, 2025
80d5fd5
Support `labelnames` argument being a Tuple expression
MadLittleMods Jul 23, 2025
1504100
Fill in `synapse/federation/federation_server.py`
MadLittleMods Jul 23, 2025
608f72e
Fill in `synapse/federation/sender/transaction_manager.py`
MadLittleMods Jul 23, 2025
0a2877e
Fill in `synapse/metrics/__init__.py`
MadLittleMods Jul 23, 2025
0c93d85
Fill in `synapse/metrics/_gc.py`
MadLittleMods Jul 23, 2025
fe1a16f
Fill in `synapse/metrics/common_usage_metrics.py`
MadLittleMods Jul 23, 2025
944df9c
Fill in `synapse/push/pusherpool.py`
MadLittleMods Jul 23, 2025
54e2374
Fill in `synapse/replication/http/_base.py`
MadLittleMods Jul 23, 2025
883062b
Fill in `synapse/storage/databases/main/event_federation.py`
MadLittleMods Jul 23, 2025
90efa41
Fill in `synapse/storage/databases/main/events_worker.py`
MadLittleMods Jul 23, 2025
f4b6d35
Fill in `synapse/util/batching_queue.py`
MadLittleMods Jul 23, 2025
656c3ad
Fill in `synapse/util/caches/deferred_cache.py`
MadLittleMods Jul 23, 2025
c55e615
Add changelog
MadLittleMods Jul 23, 2025
0fd34a6
Add upgrade notes
MadLittleMods Jul 23, 2025
563f543
Fix `make_fake_db_pool`
MadLittleMods Jul 23, 2025
554d588
Fix `event_persisted_position` usage
MadLittleMods Jul 23, 2025
2536aaf
Fix lints
MadLittleMods Jul 23, 2025
7b55ffb
Fill in event `Gauge` metrics from `synapse/metrics/__init__.py`
MadLittleMods Jul 24, 2025
68061f9
Merge branch 'develop' into madlittlemods/18592-refactor-gauge
MadLittleMods Jul 24, 2025
d587aa9
Remove debug log
MadLittleMods Jul 24, 2025
def4eb5
Merge branch 'develop' into madlittlemods/18592-refactor-gauge
MadLittleMods Jul 25, 2025
650ce32
Fix grammar
MadLittleMods Jul 25, 2025
4387262
Fix duplicate word typo
MadLittleMods Jul 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions contrib/grafana/synapse.json
Original file line number Diff line number Diff line change
Expand Up @@ -4396,7 +4396,7 @@
"exemplar": false,
"expr": "(time() - max without (job, index, host) (avg_over_time(synapse_federation_last_received_pdu_time[10m]))) / 60",
"instant": false,
"legendFormat": "{{server_name}} ",
"legendFormat": "{{origin_server_name}} ",
"range": true,
"refId": "A"
}
Expand Down Expand Up @@ -4518,7 +4518,7 @@
"exemplar": false,
"expr": "(time() - max without (job, index, host) (avg_over_time(synapse_federation_last_sent_pdu_time[10m]))) / 60",
"instant": false,
"legendFormat": "{{server_name}}",
"legendFormat": "{{destination_server_name}}",
"range": true,
"refId": "A"
}
Expand Down
107 changes: 105 additions & 2 deletions scripts-dev/mypy_synapse_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,13 @@
import mypy.types
from mypy.erasetype import remove_instance_last_known_values
from mypy.errorcodes import ErrorCode
from mypy.nodes import ARG_NAMED_OPT, TempNode, Var
from mypy.plugin import FunctionSigContext, MethodSigContext, Plugin
from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, TupleExpr, Var
from mypy.plugin import (
FunctionLike,
FunctionSigContext,
MethodSigContext,
Plugin,
)
from mypy.typeops import bind_self
from mypy.types import (
AnyType,
Expand All @@ -43,11 +48,30 @@
UnionType,
)

PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
"missing-server-name-label",
"`SERVER_NAME_LABEL` required in metric",
category="per-homeserver-tenant-metrics",
)


class SynapsePlugin(Plugin):
def get_function_signature_hook(
self, fullname: str
) -> Optional[Callable[[FunctionSigContext], FunctionLike]]:
if fullname in (
"prometheus_client.metrics.Gauge",
# TODO: Add other prometheus_client metrics that need checking as we
# refactor, see https://github.com/element-hq/synapse/issues/18592
):
return check_prometheus_metric_instantiation

return None

def get_method_signature_hook(
self, fullname: str
) -> Optional[Callable[[MethodSigContext], CallableType]]:
# print(f"m fullname={fullname}")
if fullname.startswith(
(
"synapse.util.caches.descriptors.CachedFunction.__call__",
Expand All @@ -65,6 +89,85 @@ def get_method_signature_hook(
return None


def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
    """
    Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label
    when instantiated.

    This is important because we support multiple Synapse instances running in the same
    process, where all metrics share a single global `REGISTRY`. The `server_name` label
    ensures metrics are correctly separated by homeserver.

    There are also some metrics that apply at the process level, such as CPU usage,
    Python garbage collection, Twisted reactor tick time which shouldn't have the
    `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the
    check, e.g. `# type: ignore[missing-server-name-label]`.
    """
    # The true signature, this isn't being modified so this is what will be returned.
    signature: CallableType = ctx.default_signature

    # Sanity check the arguments are still as expected in this version of
    # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)`
    #
    # `signature.arg_names` should be: ["name", "documentation", "labelnames", ...]
    if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames":
        # Avoid indexing `arg_names[2]` in the message when fewer than 3 args exist
        # (that would raise IndexError inside the plugin itself).
        actual_third_arg = (
            signature.arg_names[2] if len(signature.arg_names) >= 3 else "<missing>"
        )
        ctx.api.fail(
            f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got "
            f"{actual_third_arg}",
            ctx.context,
        )
        return signature

    # Ensure mypy is passing the correct number of arguments because we are doing some
    # dirty indexing into `ctx.args` later on.
    assert len(ctx.args) == len(signature.arg_names), (
        f"Expected the list of arguments in the {signature.name} signature ({len(signature.arg_names)}) "
        f"to match the number of arguments from the function signature context ({len(ctx.args)})"
    )

    # Check if the `labelnames` argument includes `SERVER_NAME_LABEL`
    #
    # `ctx.args` should look like this:
    # ```
    # [
    #     [StrExpr("name")],
    #     [StrExpr("documentation")],
    #     [ListExpr([StrExpr("label1"), StrExpr("label2")])]
    #     ...
    # ]
    # ```
    labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
    if isinstance(labelnames_arg_expression, (ListExpr, TupleExpr)):
        # Check if the `labelnames` argument includes the `server_name` label
        # (`SERVER_NAME_LABEL`).
        for labelname_expression in labelnames_arg_expression.items:
            if (
                isinstance(labelname_expression, NameExpr)
                and labelname_expression.fullname == "synapse.metrics.SERVER_NAME_LABEL"
            ):
                # Found the `SERVER_NAME_LABEL`, all good!
                break
        else:
            ctx.api.fail(
                f"Expected {signature.name} to include `SERVER_NAME_LABEL` in the list of labels. "
                "If this is a process-level metric (vs homeserver-level), use a type ignore comment "
                "to disable this check.",
                ctx.context,
                code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
            )
    else:
        # `labelnames` was omitted entirely or is not a literal list/tuple expression,
        # so we cannot statically verify the label set.
        ctx.api.fail(
            f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
            f"(including `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. "
            "If this is a process-level metric (vs homeserver-level), use a type ignore comment "
            "to disable this check.",
            ctx.context,
            code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
        )

    return signature


def _get_true_return_type(signature: CallableType) -> mypy.types.Type:
"""
Get the "final" return type of a callable which might return an Awaitable/Deferred.
Expand Down
8 changes: 6 additions & 2 deletions synapse/app/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,8 +525,12 @@ async def start(hs: "HomeServer") -> None:
)

# Register the threadpools with our metrics.
register_threadpool("default", reactor.getThreadPool())
register_threadpool("gai_resolver", resolver_threadpool)
register_threadpool(
name="default", server_name=server_name, threadpool=reactor.getThreadPool()
)
register_threadpool(
name="gai_resolver", server_name=server_name, threadpool=resolver_threadpool
)

# Set up the SIGHUP machinery.
if hasattr(signal, "SIGHUP"):
Expand Down
34 changes: 26 additions & 8 deletions synapse/app/phone_stats_home.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from twisted.internet import defer

from synapse.metrics import SERVER_NAME_LABEL
from synapse.metrics.background_process_metrics import (
run_as_background_process,
)
Expand Down Expand Up @@ -57,16 +58,25 @@
_stats_process: List[Tuple[int, "resource.struct_rusage"]] = []

# Gauges to expose monthly active user control metrics
current_mau_gauge = Gauge(
    "synapse_admin_mau_current",
    "Current MAU",
    labelnames=[SERVER_NAME_LABEL],
)
current_mau_by_service_gauge = Gauge(
    "synapse_admin_mau_current_mau_by_service",
    "Current MAU by service",
    labelnames=["app_service", SERVER_NAME_LABEL],
)
max_mau_gauge = Gauge(
    "synapse_admin_mau_max",
    "MAU Limit",
    labelnames=[SERVER_NAME_LABEL],
)
registered_reserved_users_mau_gauge = Gauge(
    "synapse_admin_mau_registered_reserved_users",
    "Registered users with reserved threepids",
    labelnames=[SERVER_NAME_LABEL],
)


Expand Down Expand Up @@ -237,13 +247,21 @@ async def _generate_monthly_active_users() -> None:
await store.get_monthly_active_count_by_service()
)
reserved_users = await store.get_registered_reserved_users()
current_mau_gauge.set(float(current_mau_count))
current_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set(
float(current_mau_count)
)

for app_service, count in current_mau_count_by_service.items():
current_mau_by_service_gauge.labels(app_service).set(float(count))

registered_reserved_users_mau_gauge.set(float(len(reserved_users)))
max_mau_gauge.set(float(hs.config.server.max_mau_value))
current_mau_by_service_gauge.labels(
app_service=app_service, **{SERVER_NAME_LABEL: server_name}
).set(float(count))

registered_reserved_users_mau_gauge.labels(
**{SERVER_NAME_LABEL: server_name}
).set(float(len(reserved_users)))
max_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set(
float(hs.config.server.max_mau_value)
)

return run_as_background_process(
"generate_monthly_active_users",
Expand Down
7 changes: 5 additions & 2 deletions synapse/federation/federation_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
tag_args,
trace,
)
from synapse.metrics import SERVER_NAME_LABEL
from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.replication.http.federation import (
ReplicationFederationSendEduRestServlet,
Expand Down Expand Up @@ -120,7 +121,7 @@
last_pdu_ts_metric = Gauge(
"synapse_federation_last_received_pdu_time",
"The timestamp of the last PDU which was successfully received from the given domain",
labelnames=("server_name",),
labelnames=("origin_server_name", SERVER_NAME_LABEL),
Copy link
Copy Markdown
Contributor Author

@MadLittleMods MadLittleMods Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric was already using the server_name label so I've had to rename the label. I've updated the contrib/grafana/synapse.json but it's something we probably want to call out in the upgrade notes (added)

)


Expand Down Expand Up @@ -545,7 +546,9 @@ async def process_pdu(pdu: EventBase) -> JsonDict:
)

if newest_pdu_ts and origin in self._federation_metrics_domains:
last_pdu_ts_metric.labels(server_name=origin).set(newest_pdu_ts / 1000)
last_pdu_ts_metric.labels(
origin_server_name=origin, **{SERVER_NAME_LABEL: self.server_name}
).set(newest_pdu_ts / 1000)
Comment thread
MadLittleMods marked this conversation as resolved.

return pdu_results

Expand Down
10 changes: 6 additions & 4 deletions synapse/federation/sender/transaction_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
tags,
whitelisted_homeserver,
)
from synapse.metrics import SERVER_NAME_LABEL
from synapse.types import JsonDict
from synapse.util import json_decoder
from synapse.util.metrics import measure_func
Expand All @@ -47,7 +48,7 @@
last_pdu_ts_metric = Gauge(
"synapse_federation_last_sent_pdu_time",
"The timestamp of the last PDU which was successfully sent to the given domain",
labelnames=("server_name",),
labelnames=("destination_server_name", SERVER_NAME_LABEL),
Copy link
Copy Markdown
Contributor Author

@MadLittleMods MadLittleMods Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric was already using the server_name label so I've had to rename the label. I've updated the contrib/grafana/synapse.json but it's something we probably want to call out in the upgrade notes (added)

)


Expand Down Expand Up @@ -191,6 +192,7 @@ def json_data_cb() -> JsonDict:

if pdus and destination in self._federation_metrics_domains:
last_pdu = pdus[-1]
last_pdu_ts_metric.labels(server_name=destination).set(
last_pdu.origin_server_ts / 1000
)
last_pdu_ts_metric.labels(
destination_server_name=destination,
**{SERVER_NAME_LABEL: self.server_name},
).set(last_pdu.origin_server_ts / 1000)
59 changes: 42 additions & 17 deletions synapse/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,19 +488,27 @@ def collect(self) -> Iterable[Metric]:

# Used to track where various components have processed in the event stream,
# e.g. federation sending, appservice sending, etc.
event_processing_positions = Gauge(
    "synapse_event_processing_positions", "", labelnames=["name", SERVER_NAME_LABEL]
)

# Used to track the current max events stream position
event_persisted_position = Gauge(
    "synapse_event_persisted_position", "", labelnames=[SERVER_NAME_LABEL]
)

# Used to track the received_ts of the last event processed by various
# components
event_processing_last_ts = Gauge(
    "synapse_event_processing_last_ts", "", labelnames=["name", SERVER_NAME_LABEL]
)

# Used to track the lag processing events. This is the time difference
# between the last processed event's received_ts and the time it was
# finished being processed.
event_processing_lag = Gauge(
    "synapse_event_processing_lag", "", labelnames=["name", SERVER_NAME_LABEL]
)

event_processing_lag_by_event = Histogram(
"synapse_event_processing_lag_by_event",
Expand All @@ -509,7 +517,11 @@ def collect(self) -> Iterable[Metric]:
)

# Build info of the running server.
build_info = Gauge(
#
# This is a process-level metric, so it does not have the `SERVER_NAME_LABEL`. We
# consider this process-level because all Synapse homeservers running in the process
# will use the same Synapse version.
build_info = Gauge( # type: ignore[missing-server-name-label]
"synapse_build_info", "Build information", ["pythonversion", "version", "osversion"]
)
build_info.labels(
Expand All @@ -531,38 +543,51 @@ def collect(self) -> Iterable[Metric]:
threadpool_total_threads = Gauge(
    "synapse_threadpool_total_threads",
    "Total number of threads currently in the threadpool",
    labelnames=["name", SERVER_NAME_LABEL],
)

threadpool_total_working_threads = Gauge(
    "synapse_threadpool_working_threads",
    "Number of threads currently working in the threadpool",
    labelnames=["name", SERVER_NAME_LABEL],
)

threadpool_total_min_threads = Gauge(
    "synapse_threadpool_min_threads",
    "Minimum number of threads configured in the threadpool",
    labelnames=["name", SERVER_NAME_LABEL],
)

threadpool_total_max_threads = Gauge(
    "synapse_threadpool_max_threads",
    "Maximum number of threads configured in the threadpool",
    labelnames=["name", SERVER_NAME_LABEL],
)


def register_threadpool(name: str, threadpool: ThreadPool) -> None:
"""Add metrics for the threadpool."""
def register_threadpool(*, name: str, server_name: str, threadpool: ThreadPool) -> None:
    """
    Add metrics for the threadpool.

    Args:
        name: The name of the threadpool, used to identify it in the metrics.
        server_name: The homeserver name (used to label metrics) (this should be
            `hs.hostname`).
        threadpool: The threadpool to register metrics for.
    """
    # Static configuration values are set once at registration time.
    threadpool_total_min_threads.labels(
        name=name, **{SERVER_NAME_LABEL: server_name}
    ).set(threadpool.min)
    threadpool_total_max_threads.labels(
        name=name, **{SERVER_NAME_LABEL: server_name}
    ).set(threadpool.max)

    # Live thread counts are sampled lazily at scrape time via `set_function`.
    threadpool_total_threads.labels(
        name=name, **{SERVER_NAME_LABEL: server_name}
    ).set_function(lambda: len(threadpool.threads))
    threadpool_total_working_threads.labels(
        name=name, **{SERVER_NAME_LABEL: server_name}
    ).set_function(lambda: len(threadpool.working))


class MetricsResource(Resource):
Expand Down
Loading
Loading