Skip to content

Commit b4f06b2

Browse files
committed
Add tests to ensure we get all metrics even if one hook throws exception
1 parent 31ad15a commit b4f06b2

2 files changed

Lines changed: 68 additions & 16 deletions

File tree

synapse/metrics/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,17 @@ def collect(self) -> Iterable[Metric]:
181181
# (we don't enforce it here, one level up).
182182
g = GaugeMetricFamily(self.name, self.desc, labels=self.labelnames) # type: ignore[missing-server-name-label]
183183

184-
for hook in self._instance_id_to_hook_map.values():
184+
for homeserver_instance_id, hook in self._instance_id_to_hook_map.items():
185185
try:
186186
hook_result = hook()
187187
except Exception:
188188
logger.exception(
189-
"Exception running callback for LaterGauge(%s)", self.name
189+
"Exception running callback for LaterGauge(%s) for homeserver_instance_id=%s",
190+
self.name,
191+
homeserver_instance_id,
190192
)
191193
yield g
192-
# Continue to return metrics that aren't broken
194+
# Continue to return the rest of the metrics that aren't broken
193195
continue
194196

195197
if isinstance(hook_result, (int, float)):

tests/metrics/test_metrics.py

Lines changed: 63 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# [This file includes modifications made by New Vector Limited]
1919
#
2020
#
21-
from typing import Dict, Protocol, Tuple
21+
from typing import Dict, NoReturn, Protocol, Tuple
2222

2323
from prometheus_client.core import Sample
2424

@@ -27,6 +27,7 @@
2727
SERVER_NAME_LABEL,
2828
InFlightGauge,
2929
LaterGauge,
30+
all_later_gauges_to_clean_up_on_shutdown,
3031
generate_latest,
3132
)
3233
from synapse.util.caches.deferred_cache import DeferredCache
@@ -292,42 +293,91 @@ def test_cache_metric_multiple_servers(self) -> None:
292293

293294

294295
class LaterGaugeTests(unittest.HomeserverTestCase):
296+
def setUp(self) -> None:
297+
super().setUp()
298+
self.later_gauge = LaterGauge(
299+
name="foo",
300+
desc="",
301+
labelnames=[SERVER_NAME_LABEL],
302+
)
303+
304+
def tearDown(self) -> None:
305+
super().tearDown()
306+
307+
REGISTRY.unregister(self.later_gauge)
308+
all_later_gauges_to_clean_up_on_shutdown.pop(self.later_gauge.name, None)
309+
295310
def test_later_gauge_multiple_servers(self) -> None:
296311
"""
297312
Test that LaterGauge metrics are reported correctly across multiple servers. We
298313
will have an metrics entry for each homeserver that is labeled with the
299314
`server_name` label.
300315
"""
301-
later_gauge = LaterGauge(
302-
name="foo",
303-
desc="",
304-
labelnames=[SERVER_NAME_LABEL],
305-
)
306-
later_gauge.register_hook(
316+
self.later_gauge.register_hook(
307317
homeserver_instance_id="123", hook=lambda: {("hs1",): 1}
308318
)
309-
later_gauge.register_hook(
319+
self.later_gauge.register_hook(
310320
homeserver_instance_id="456", hook=lambda: {("hs2",): 2}
311321
)
312322

313323
metrics_map = get_latest_metrics()
314324

315-
# Find the metrics for the caches from both homeservers
325+
# Find the metrics from both homeservers
316326
hs1_metric = 'foo{server_name="hs1"}'
317327
hs1_metric_value = metrics_map.get(hs1_metric)
318328
self.assertIsNotNone(
319329
hs1_metric_value,
320-
f"Missing metric {hs1_metric} in cache metrics {metrics_map}",
330+
f"Missing metric {hs1_metric} in metrics {metrics_map}",
321331
)
332+
self.assertEqual(hs1_metric_value, "1.0")
333+
322334
hs2_metric = 'foo{server_name="hs2"}'
323335
hs2_metric_value = metrics_map.get(hs2_metric)
324336
self.assertIsNotNone(
325337
hs2_metric_value,
326-
f"Missing metric {hs2_metric} in cache metrics {metrics_map}",
338+
f"Missing metric {hs2_metric} in metrics {metrics_map}",
327339
)
340+
self.assertEqual(hs2_metric_value, "2.0")
328341

329-
# Sanity check the metric values
330-
self.assertEqual(hs1_metric_value, "1.0")
342+
def test_later_gauge_hook_exception(self) -> None:
343+
"""
344+
Test that LaterGauge metrics are collected across multiple servers even if one
345+
hooks is throwing an exception.
346+
"""
347+
348+
def raise_exception() -> NoReturn:
349+
raise Exception("fake error generating data")
350+
351+
# Make the hook for hs1 throw an exception
352+
self.later_gauge.register_hook(
353+
homeserver_instance_id="123", hook=raise_exception
354+
)
355+
# Metrics from hs2 still work fine
356+
self.later_gauge.register_hook(
357+
homeserver_instance_id="456", hook=lambda: {("hs2",): 2}
358+
)
359+
360+
metrics_map = get_latest_metrics()
361+
362+
# Since we encountered an exception while trying to collect metrics from hs1, we
363+
# don't expect to see it here.
364+
hs1_metric = 'foo{server_name="hs1"}'
365+
hs1_metric_value = metrics_map.get(hs1_metric)
366+
self.assertIsNone(
367+
hs1_metric_value,
368+
(
369+
"Since we encountered an exception while trying to collect metrics from hs1"
370+
f"we don't expect to see it the metrics_map {metrics_map}"
371+
),
372+
)
373+
374+
# We should still see metrics from hs2 though
375+
hs2_metric = 'foo{server_name="hs2"}'
376+
hs2_metric_value = metrics_map.get(hs2_metric)
377+
self.assertIsNotNone(
378+
hs2_metric_value,
379+
f"Missing metric {hs2_metric} in cache metrics {metrics_map}",
380+
)
331381
self.assertEqual(hs2_metric_value, "2.0")
332382

333383

0 commit comments

Comments
 (0)