Skip to content
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b14eed7
Add in base linting for metrics
MadLittleMods Jul 23, 2025
4bb84a6
Fill in `synapse/app/phone_stats_home.py`
MadLittleMods Jul 23, 2025
80d5fd5
Support `labelnames` argument being a Tuple expression
MadLittleMods Jul 23, 2025
1504100
Fill in `synapse/federation/federation_server.py`
MadLittleMods Jul 23, 2025
608f72e
Fill in `synapse/federation/sender/transaction_manager.py`
MadLittleMods Jul 23, 2025
0a2877e
Fill in `synapse/metrics/__init__.py`
MadLittleMods Jul 23, 2025
0c93d85
Fill in `synapse/metrics/_gc.py`
MadLittleMods Jul 23, 2025
fe1a16f
Fill in `synapse/metrics/common_usage_metrics.py`
MadLittleMods Jul 23, 2025
944df9c
Fill in `synapse/push/pusherpool.py`
MadLittleMods Jul 23, 2025
54e2374
Fill in `synapse/replication/http/_base.py`
MadLittleMods Jul 23, 2025
883062b
Fill in `synapse/storage/databases/main/event_federation.py`
MadLittleMods Jul 23, 2025
90efa41
Fill in `synapse/storage/databases/main/events_worker.py`
MadLittleMods Jul 23, 2025
f4b6d35
Fill in `synapse/util/batching_queue.py`
MadLittleMods Jul 23, 2025
656c3ad
Fill in `synapse/util/caches/deferred_cache.py`
MadLittleMods Jul 23, 2025
c55e615
Add changelog
MadLittleMods Jul 23, 2025
0fd34a6
Add upgrade notes
MadLittleMods Jul 23, 2025
563f543
Fix `make_fake_db_pool`
MadLittleMods Jul 23, 2025
554d588
Fix `event_persisted_position` usage
MadLittleMods Jul 23, 2025
2536aaf
Fix lints
MadLittleMods Jul 23, 2025
7b55ffb
Fill in event `Gauge` metrics from `synapse/metrics/__init__.py`
MadLittleMods Jul 24, 2025
68061f9
Merge branch 'develop' into madlittlemods/18592-refactor-gauge
MadLittleMods Jul 24, 2025
d587aa9
Remove debug log
MadLittleMods Jul 24, 2025
def4eb5
Merge branch 'develop' into madlittlemods/18592-refactor-gauge
MadLittleMods Jul 25, 2025
650ce32
Fix grammar
MadLittleMods Jul 25, 2025
4387262
Fix duplicate word typo
MadLittleMods Jul 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/18725.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refactor `Gauge` metrics to be homeserver-scoped.
4 changes: 2 additions & 2 deletions contrib/grafana/synapse.json
Original file line number Diff line number Diff line change
Expand Up @@ -4396,7 +4396,7 @@
"exemplar": false,
"expr": "(time() - max without (job, index, host) (avg_over_time(synapse_federation_last_received_pdu_time[10m]))) / 60",
"instant": false,
"legendFormat": "{{server_name}} ",
"legendFormat": "{{origin_server_name}} ",
"range": true,
"refId": "A"
}
Expand Down Expand Up @@ -4518,7 +4518,7 @@
"exemplar": false,
"expr": "(time() - max without (job, index, host) (avg_over_time(synapse_federation_last_sent_pdu_time[10m]))) / 60",
"instant": false,
"legendFormat": "{{server_name}}",
"legendFormat": "{{destination_server_name}}",
"range": true,
"refId": "A"
}
Expand Down
19 changes: 19 additions & 0 deletions docs/upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,25 @@ each upgrade are complete before moving on to the next upgrade, to avoid
stacking them up. You can monitor the currently running background updates with
[the Admin API](usage/administration/admin_api/background_updates.html#status).

# Upgrading to v1.136.0

## Metric labels have changed on `synapse_federation_last_received_pdu_time` and `synapse_federation_last_sent_pdu_time`

Previously, the `synapse_federation_last_received_pdu_time` and
`synapse_federation_last_sent_pdu_time` metrics both used the `server_name` label to
differentiate between different servers that we send and receive events from.

Since we're now using the `server_name` label to differentiate between different Synapse
homeserver instances running in the same process, these metrics have been changed as follows:

- `synapse_federation_last_received_pdu_time` now uses the `origin_server_name` label
- `synapse_federation_last_sent_pdu_time` now uses the `destination_server_name` label

The Grafana dashboard JSON in `contrib/grafana/synapse.json` has been updated to reflect
this change but you will need to manually update your own existing Grafana dashboards
using these metrics.


# Upgrading to v1.135.0

## `on_user_registration` module API callback may now run on any worker
Expand Down
106 changes: 104 additions & 2 deletions scripts-dev/mypy_synapse_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,13 @@
import mypy.types
from mypy.erasetype import remove_instance_last_known_values
from mypy.errorcodes import ErrorCode
from mypy.nodes import ARG_NAMED_OPT, TempNode, Var
from mypy.plugin import FunctionSigContext, MethodSigContext, Plugin
from mypy.nodes import ARG_NAMED_OPT, ListExpr, NameExpr, TempNode, TupleExpr, Var
from mypy.plugin import (
FunctionLike,
FunctionSigContext,
MethodSigContext,
Plugin,
)
from mypy.typeops import bind_self
from mypy.types import (
AnyType,
Expand All @@ -43,8 +48,26 @@
UnionType,
)

PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL = ErrorCode(
"missing-server-name-label",
"`SERVER_NAME_LABEL` required in metric",
category="per-homeserver-tenant-metrics",
)


class SynapsePlugin(Plugin):
def get_function_signature_hook(
self, fullname: str
) -> Optional[Callable[[FunctionSigContext], FunctionLike]]:
if fullname in (
"prometheus_client.metrics.Gauge",
# TODO: Add other prometheus_client metrics that need checking as we
# refactor, see https://github.com/element-hq/synapse/issues/18592
):
return check_prometheus_metric_instantiation

return None

def get_method_signature_hook(
self, fullname: str
) -> Optional[Callable[[MethodSigContext], CallableType]]:
Expand All @@ -65,6 +88,85 @@ def get_method_signature_hook(
return None


def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableType:
    """
    Ensure that the `prometheus_client` metrics include the `SERVER_NAME_LABEL` label
    when instantiated.

    This is important because we support multiple Synapse instances running in the same
    process, where all metrics share a single global `REGISTRY`. The `server_name` label
    ensures metrics are correctly separated by homeserver.

    There are also some metrics that apply at the process level, such as CPU usage,
    Python garbage collection, Twisted reactor tick time which shouldn't have the
    `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the
    check, e.g. `# type: ignore[missing-server-name-label]`.

    Args:
        ctx: The mypy function-signature context for the metric constructor call.

    Returns:
        The unmodified default signature — this hook only reports errors; it never
        rewrites the signature.
    """
    # The true signature, this isn't being modified so this is what will be returned.
    signature: CallableType = ctx.default_signature

    # Sanity check the arguments are still as expected in this version of
    # `prometheus_client`. ex. `Counter(name, documentation, labelnames, ...)`
    #
    # `signature.arg_names` should be: ["name", "documentation", "labelnames", ...]
    if len(signature.arg_names) < 3 or signature.arg_names[2] != "labelnames":
        # Report the full `arg_names` list rather than indexing `arg_names[2]`,
        # which would raise `IndexError` when fewer than 3 arguments exist.
        ctx.api.fail(
            f"Expected the 3rd argument of {signature.name} to be 'labelnames', but got "
            f"{signature.arg_names}",
            ctx.context,
        )
        return signature

    # Ensure mypy is passing the correct number of arguments because we are doing some
    # dirty indexing into `ctx.args` later on.
    assert len(ctx.args) == len(signature.arg_names), (
        f"Expected the list of arguments in the {signature.name} signature ({len(signature.arg_names)}) "
        f"to match the number of arguments from the function signature context ({len(ctx.args)})"
    )

    # Check if the `labelnames` argument includes `SERVER_NAME_LABEL`
    #
    # `ctx.args` should look like this:
    # ```
    # [
    #     [StrExpr("name")],
    #     [StrExpr("documentation")],
    #     [ListExpr([StrExpr("label1"), StrExpr("label2")])]
    #     ...
    # ]
    # ```
    labelnames_arg_expression = ctx.args[2][0] if len(ctx.args[2]) > 0 else None
    if isinstance(labelnames_arg_expression, (ListExpr, TupleExpr)):
        # Check if the `labelnames` argument includes the `server_name` label
        # (`SERVER_NAME_LABEL`).
        for labelname_expression in labelnames_arg_expression.items:
            if (
                isinstance(labelname_expression, NameExpr)
                and labelname_expression.fullname == "synapse.metrics.SERVER_NAME_LABEL"
            ):
                # Found the `SERVER_NAME_LABEL`, all good!
                break
        else:
            # `for`/`else`: we exhausted the label list without finding the
            # sentinel label, so flag the metric.
            ctx.api.fail(
                f"Expected {signature.name} to include `SERVER_NAME_LABEL` in the list of labels. "
                "If this is a process-level metric (vs homeserver-level), use a type ignore comment "
                "to disable this check.",
                ctx.context,
                code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
            )
    else:
        # `labelnames` was omitted or is not a literal list/tuple expression, so we
        # cannot statically verify the labels — flag it for explicit review.
        ctx.api.fail(
            f"Expected the `labelnames` argument of {signature.name} to be a list of label names "
            f"(including `SERVER_NAME_LABEL`), but got {labelnames_arg_expression}. "
            "If this is a process-level metric (vs homeserver-level), use a type ignore comment "
            "to disable this check.",
            ctx.context,
            code=PROMETHEUS_METRIC_MISSING_SERVER_NAME_LABEL,
        )

    return signature


def _get_true_return_type(signature: CallableType) -> mypy.types.Type:
"""
Get the "final" return type of a callable which might return an Awaitable/Deferred.
Expand Down
8 changes: 6 additions & 2 deletions synapse/app/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,8 +525,12 @@ async def start(hs: "HomeServer") -> None:
)

# Register the threadpools with our metrics.
register_threadpool("default", reactor.getThreadPool())
register_threadpool("gai_resolver", resolver_threadpool)
register_threadpool(
name="default", server_name=server_name, threadpool=reactor.getThreadPool()
)
register_threadpool(
name="gai_resolver", server_name=server_name, threadpool=resolver_threadpool
)

# Set up the SIGHUP machinery.
if hasattr(signal, "SIGHUP"):
Expand Down
34 changes: 26 additions & 8 deletions synapse/app/phone_stats_home.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from twisted.internet import defer

from synapse.metrics import SERVER_NAME_LABEL
from synapse.metrics.background_process_metrics import (
run_as_background_process,
)
Expand Down Expand Up @@ -57,16 +58,25 @@
_stats_process: List[Tuple[int, "resource.struct_rusage"]] = []

# Gauges to expose monthly active user control metrics
current_mau_gauge = Gauge("synapse_admin_mau_current", "Current MAU")
current_mau_gauge = Gauge(
"synapse_admin_mau_current",
"Current MAU",
labelnames=[SERVER_NAME_LABEL],
)
current_mau_by_service_gauge = Gauge(
"synapse_admin_mau_current_mau_by_service",
"Current MAU by service",
["app_service"],
labelnames=["app_service", SERVER_NAME_LABEL],
)
max_mau_gauge = Gauge(
"synapse_admin_mau_max",
"MAU Limit",
labelnames=[SERVER_NAME_LABEL],
)
max_mau_gauge = Gauge("synapse_admin_mau_max", "MAU Limit")
registered_reserved_users_mau_gauge = Gauge(
"synapse_admin_mau_registered_reserved_users",
"Registered users with reserved threepids",
labelnames=[SERVER_NAME_LABEL],
)


Expand Down Expand Up @@ -237,13 +247,21 @@ async def _generate_monthly_active_users() -> None:
await store.get_monthly_active_count_by_service()
)
reserved_users = await store.get_registered_reserved_users()
current_mau_gauge.set(float(current_mau_count))
current_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set(
float(current_mau_count)
)

for app_service, count in current_mau_count_by_service.items():
current_mau_by_service_gauge.labels(app_service).set(float(count))

registered_reserved_users_mau_gauge.set(float(len(reserved_users)))
max_mau_gauge.set(float(hs.config.server.max_mau_value))
current_mau_by_service_gauge.labels(
app_service=app_service, **{SERVER_NAME_LABEL: server_name}
).set(float(count))

registered_reserved_users_mau_gauge.labels(
**{SERVER_NAME_LABEL: server_name}
).set(float(len(reserved_users)))
max_mau_gauge.labels(**{SERVER_NAME_LABEL: server_name}).set(
float(hs.config.server.max_mau_value)
)

return run_as_background_process(
"generate_monthly_active_users",
Expand Down
7 changes: 5 additions & 2 deletions synapse/federation/federation_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
tag_args,
trace,
)
from synapse.metrics import SERVER_NAME_LABEL
from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.replication.http.federation import (
ReplicationFederationSendEduRestServlet,
Expand Down Expand Up @@ -120,7 +121,7 @@
last_pdu_ts_metric = Gauge(
"synapse_federation_last_received_pdu_time",
"The timestamp of the last PDU which was successfully received from the given domain",
labelnames=("server_name",),
labelnames=("origin_server_name", SERVER_NAME_LABEL),
Copy link
Copy Markdown
Contributor Author

@MadLittleMods MadLittleMods Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric was already using the server_name label so I've had to rename the label. I've updated the contrib/grafana/synapse.json but it's something we probably want to call out in the upgrade notes (added)

)


Expand Down Expand Up @@ -545,7 +546,9 @@ async def process_pdu(pdu: EventBase) -> JsonDict:
)

if newest_pdu_ts and origin in self._federation_metrics_domains:
last_pdu_ts_metric.labels(server_name=origin).set(newest_pdu_ts / 1000)
last_pdu_ts_metric.labels(
origin_server_name=origin, **{SERVER_NAME_LABEL: self.server_name}
).set(newest_pdu_ts / 1000)
Comment thread
MadLittleMods marked this conversation as resolved.

return pdu_results

Expand Down
9 changes: 6 additions & 3 deletions synapse/federation/sender/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@
from synapse.federation.units import Edu
from synapse.logging.context import make_deferred_yieldable, run_in_background
from synapse.metrics import (
SERVER_NAME_LABEL,
LaterGauge,
event_processing_loop_counter,
event_processing_loop_room_count,
Expand Down Expand Up @@ -702,10 +703,12 @@ async def handle_room_events(events: List[EventBase]) -> None:
assert ts is not None

synapse.metrics.event_processing_lag.labels(
"federation_sender"
name="federation_sender",
**{SERVER_NAME_LABEL: self.server_name},
).set(now - ts)
synapse.metrics.event_processing_last_ts.labels(
"federation_sender"
name="federation_sender",
**{SERVER_NAME_LABEL: self.server_name},
).set(ts)

events_processed_counter.inc(len(event_entries))
Expand All @@ -717,7 +720,7 @@ async def handle_room_events(events: List[EventBase]) -> None:
event_processing_loop_counter.labels("federation_sender").inc()

synapse.metrics.event_processing_positions.labels(
"federation_sender"
name="federation_sender", **{SERVER_NAME_LABEL: self.server_name}
).set(next_token)

finally:
Expand Down
10 changes: 6 additions & 4 deletions synapse/federation/sender/transaction_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
tags,
whitelisted_homeserver,
)
from synapse.metrics import SERVER_NAME_LABEL
from synapse.types import JsonDict
from synapse.util import json_decoder
from synapse.util.metrics import measure_func
Expand All @@ -47,7 +48,7 @@
last_pdu_ts_metric = Gauge(
"synapse_federation_last_sent_pdu_time",
"The timestamp of the last PDU which was successfully sent to the given domain",
labelnames=("server_name",),
labelnames=("destination_server_name", SERVER_NAME_LABEL),
Copy link
Copy Markdown
Contributor Author

@MadLittleMods MadLittleMods Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This metric was already using the server_name label so I've had to rename the label. I've updated the contrib/grafana/synapse.json but it's something we probably want to call out in the upgrade notes (added)

)


Expand Down Expand Up @@ -191,6 +192,7 @@ def json_data_cb() -> JsonDict:

if pdus and destination in self._federation_metrics_domains:
last_pdu = pdus[-1]
last_pdu_ts_metric.labels(server_name=destination).set(
last_pdu.origin_server_ts / 1000
)
last_pdu_ts_metric.labels(
destination_server_name=destination,
**{SERVER_NAME_LABEL: self.server_name},
).set(last_pdu.origin_server_ts / 1000)
10 changes: 7 additions & 3 deletions synapse/handlers/appservice.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from synapse.handlers.presence import format_user_presence_state
from synapse.logging.context import make_deferred_yieldable, run_in_background
from synapse.metrics import (
SERVER_NAME_LABEL,
event_processing_loop_counter,
event_processing_loop_room_count,
)
Expand Down Expand Up @@ -204,7 +205,8 @@ async def handle_room_events(events: Iterable[EventBase]) -> None:
await self.store.set_appservice_last_pos(upper_bound)

synapse.metrics.event_processing_positions.labels(
"appservice_sender"
name="appservice_sender",
**{SERVER_NAME_LABEL: self.server_name},
).set(upper_bound)

events_processed_counter.inc(len(events))
Expand All @@ -221,10 +223,12 @@ async def handle_room_events(events: Iterable[EventBase]) -> None:
assert ts is not None

synapse.metrics.event_processing_lag.labels(
"appservice_sender"
name="appservice_sender",
**{SERVER_NAME_LABEL: self.server_name},
).set(now - ts)
synapse.metrics.event_processing_last_ts.labels(
"appservice_sender"
name="appservice_sender",
**{SERVER_NAME_LABEL: self.server_name},
).set(ts)
finally:
self.is_processing = False
Expand Down
6 changes: 4 additions & 2 deletions synapse/handlers/delayed_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from synapse.api.ratelimiting import Ratelimiter
from synapse.config.workers import MAIN_PROCESS_INSTANCE_NAME
from synapse.logging.opentracing import set_tag
from synapse.metrics import event_processing_positions
from synapse.metrics import SERVER_NAME_LABEL, event_processing_positions
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.replication.http.delayed_events import (
ReplicationAddedDelayedEventRestServlet,
Expand Down Expand Up @@ -191,7 +191,9 @@ async def _unsafe_process_new_event(self) -> None:
self._event_pos = max_pos

# Expose current event processing position to prometheus
event_processing_positions.labels("delayed_events").set(max_pos)
event_processing_positions.labels(
name="delayed_events", **{SERVER_NAME_LABEL: self.server_name}
).set(max_pos)

await self._store.update_delayed_events_stream_pos(max_pos)

Expand Down
Loading
Loading