Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
285f192
Fix `LaterGauge` metrics to collect from all servers (#18751)
MadLittleMods Aug 5, 2025
4dda2b1
First pass: keep track of hooks per server
MadLittleMods Aug 7, 2025
505263d
Some fix-ups
MadLittleMods Aug 7, 2025
46e4053
Add `server_name` to `register_hook`
MadLittleMods Aug 7, 2025
2e5ac44
Move metric clean-up to catch all servers
MadLittleMods Aug 7, 2025
2f25722
Update changelog number
MadLittleMods Aug 7, 2025
8b98536
Merge branch 'develop' into madlittlemods/re-introduce-18751
MadLittleMods Aug 7, 2025
a34122f
Try fix `synapse/_scripts/synapse_port_db.py` script
MadLittleMods Aug 7, 2025
b226f63
Cleanup homeservers when we `synapse/_scripts/generate_workers_map.py`
MadLittleMods Aug 7, 2025
75e7463
Fix lints
MadLittleMods Aug 7, 2025
68bb036
Use `instance_id` instead of `server_name` to track metrics
MadLittleMods Aug 7, 2025
87cc52f
Fix `BaseStreamTestCase`
MadLittleMods Aug 7, 2025
c7d1a78
Fix lints
MadLittleMods Aug 7, 2025
14aee2f
Fix multiple databases registering metrics
MadLittleMods Aug 8, 2025
5baa576
Merge branch 'develop' into madlittlemods/re-introduce-18751
MadLittleMods Aug 12, 2025
d7946f4
Merge branch 'develop' into madlittlemods/re-introduce-18751
MadLittleMods Aug 19, 2025
635dcce
Merge branch 'develop' into madlittlemods/re-introduce-18751
MadLittleMods Aug 27, 2025
59701ed
Create consistent `instance_id` in `MockHomeserver`
MadLittleMods Aug 27, 2025
309a72d
Note when `cleanup` should be called
MadLittleMods Aug 27, 2025
371aa51
`cleanup` when homeserver is garbage collected
MadLittleMods Aug 27, 2025
08755ae
Only yield the metric once when we `collect`
MadLittleMods Aug 27, 2025
31ad15a
Continue to return metrics that aren't broken
MadLittleMods Aug 27, 2025
b4f06b2
Add tests to ensure we get all metrics even if one hook throws exception
MadLittleMods Aug 27, 2025
9e07d37
Merge branch 'develop' into madlittlemods/re-introduce-18751
MadLittleMods Sep 2, 2025
1c1d6c2
Don't double yield the same gauge
MadLittleMods Sep 2, 2025
595c174
Fix `generate_workers_map` script erroring out
MadLittleMods Sep 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions synapse/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,15 +181,17 @@ def collect(self) -> Iterable[Metric]:
# (we don't enforce it here, one level up).
g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames) # type: ignore[missing-server-name-label]

for hook in self._instance_id_to_hook_map.values():
for homeserver_instance_id, hook in self._instance_id_to_hook_map.items():
try:
hook_result = hook()
except Exception:
logger.exception(
"Exception running callback for LaterGauge(%s)", self.name
"Exception running callback for LaterGauge(%s) for homeserver_instance_id=%s",
self.name,
homeserver_instance_id,
)
yield g
Comment thread
MadLittleMods marked this conversation as resolved.
Outdated
# Continue to return metrics that aren't broken
# Continue to return the rest of the metrics that aren't broken
continue

if isinstance(hook_result, (int, float)):
Expand Down
76 changes: 63 additions & 13 deletions tests/metrics/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# [This file includes modifications made by New Vector Limited]
#
#
from typing import Dict, Protocol, Tuple
from typing import Dict, NoReturn, Protocol, Tuple

from prometheus_client.core import Sample

Expand All @@ -27,6 +27,7 @@
SERVER_NAME_LABEL,
InFlightGauge,
LaterGauge,
all_later_gauges_to_clean_up_on_shutdown,
generate_latest,
)
from synapse.util.caches.deferred_cache import DeferredCache
Expand Down Expand Up @@ -292,42 +293,91 @@ def test_cache_metric_multiple_servers(self) -> None:


class LaterGaugeTests(unittest.HomeserverTestCase):
def setUp(self) -> None:
super().setUp()
self.later_gauge = LaterGauge(
name="foo",
desc="",
labelnames=[SERVER_NAME_LABEL],
)

def tearDown(self) -> None:
super().tearDown()

REGISTRY.unregister(self.later_gauge)
all_later_gauges_to_clean_up_on_shutdown.pop(self.later_gauge.name, None)

def test_later_gauge_multiple_servers(self) -> None:
"""
Test that LaterGauge metrics are reported correctly across multiple servers. We
will have an metrics entry for each homeserver that is labeled with the
`server_name` label.
"""
later_gauge = LaterGauge(
name="foo",
desc="",
labelnames=[SERVER_NAME_LABEL],
)
later_gauge.register_hook(
self.later_gauge.register_hook(
homeserver_instance_id="123", hook=lambda: {("hs1",): 1}
)
later_gauge.register_hook(
self.later_gauge.register_hook(
homeserver_instance_id="456", hook=lambda: {("hs2",): 2}
)

metrics_map = get_latest_metrics()

# Find the metrics for the caches from both homeservers
# Find the metrics from both homeservers
hs1_metric = 'foo{server_name="hs1"}'
hs1_metric_value = metrics_map.get(hs1_metric)
self.assertIsNotNone(
hs1_metric_value,
f"Missing metric {hs1_metric} in cache metrics {metrics_map}",
f"Missing metric {hs1_metric} in metrics {metrics_map}",
)
self.assertEqual(hs1_metric_value, "1.0")

hs2_metric = 'foo{server_name="hs2"}'
hs2_metric_value = metrics_map.get(hs2_metric)
self.assertIsNotNone(
hs2_metric_value,
f"Missing metric {hs2_metric} in cache metrics {metrics_map}",
f"Missing metric {hs2_metric} in metrics {metrics_map}",
)
self.assertEqual(hs2_metric_value, "2.0")

# Sanity check the metric values
self.assertEqual(hs1_metric_value, "1.0")
def test_later_gauge_hook_exception(self) -> None:
"""
Test that LaterGauge metrics are collected across multiple servers even if one
hooks is throwing an exception.
"""

def raise_exception() -> NoReturn:
raise Exception("fake error generating data")

# Make the hook for hs1 throw an exception
self.later_gauge.register_hook(
homeserver_instance_id="123", hook=raise_exception
)
# Metrics from hs2 still work fine
self.later_gauge.register_hook(
homeserver_instance_id="456", hook=lambda: {("hs2",): 2}
)

metrics_map = get_latest_metrics()

# Since we encountered an exception while trying to collect metrics from hs1, we
# don't expect to see it here.
hs1_metric = 'foo{server_name="hs1"}'
hs1_metric_value = metrics_map.get(hs1_metric)
self.assertIsNone(
hs1_metric_value,
(
"Since we encountered an exception while trying to collect metrics from hs1"
f"we don't expect to see it the metrics_map {metrics_map}"
),
)

# We should still see metrics from hs2 though
hs2_metric = 'foo{server_name="hs2"}'
hs2_metric_value = metrics_map.get(hs2_metric)
self.assertIsNotNone(
hs2_metric_value,
f"Missing metric {hs2_metric} in cache metrics {metrics_map}",
)
self.assertEqual(hs2_metric_value, "2.0")


Expand Down
Loading