Skip to content

Commit e43a1ce

Browse files
Fix cache metrics to collect from all servers (#18748)
Follow-up to #18604 Previously, our cache metrics did include the `server_name` label as expected but we were only seeing the last server being reported. This happened because we would call `CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)` where the `metric_name` only took the cache name into account, so the hook would be overwritten every time we spawned a new server. This PR updates the register logic to include the `server_name` so we have a hook for every cache on every server as expected. I noticed this problem thanks to some [tests in the Synapse Pro for Small Hosts](element-hq/synapse-small-hosts#173) repo that sanity check all metrics to ensure that we can see each metric includes data from multiple servers.
1 parent 510924a commit e43a1ce

3 files changed

Lines changed: 143 additions & 22 deletions

File tree

changelog.d/18748.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Refactor cache metrics to be homeserver-scoped.

synapse/util/caches/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,6 @@
4444
# just before they are returned from the scrape endpoint.
4545
CACHE_METRIC_REGISTRY = DynamicCollectorRegistry()
4646

47-
caches_by_name: Dict[str, Sized] = {}
48-
4947
cache_size = Gauge(
5048
"synapse_util_caches_cache_size",
5149
"",
@@ -242,8 +240,7 @@ def register_cache(
242240
server_name=server_name,
243241
collect_callback=collect_callback,
244242
)
245-
metric_name = "cache_%s_%s" % (cache_type, cache_name)
246-
caches_by_name[cache_name] = cache
243+
metric_name = "cache_%s_%s_%s" % (cache_type, cache_name, server_name)
247244
CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)
248245
return metric
249246

tests/metrics/test_metrics.py

Lines changed: 141 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -159,26 +159,149 @@ def test_cache_metric(self) -> None:
159159
name=CACHE_NAME, server_name=self.hs.hostname, max_entries=777
160160
)
161161

162-
items = {
163-
x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
164-
for x in filter(
165-
lambda x: b"cache_metrics_test_fgjkbdfg" in x,
166-
generate_latest(REGISTRY).split(b"\n"),
167-
)
168-
}
162+
metrics_map = get_latest_metrics()
163+
164+
cache_size_metric = f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}'
165+
cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="{self.hs.hostname}"}}'
166+
167+
cache_size_metric_value = metrics_map.get(cache_size_metric)
168+
self.assertIsNotNone(
169+
cache_size_metric_value,
170+
f"Missing metric {cache_size_metric} in cache metrics {metrics_map}",
171+
)
172+
cache_max_size_metric_value = metrics_map.get(cache_max_size_metric)
173+
self.assertIsNotNone(
174+
cache_max_size_metric_value,
175+
f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}",
176+
)
169177

170-
self.assertEqual(items["synapse_util_caches_cache_size"], "0.0")
171-
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
178+
self.assertEqual(cache_size_metric_value, "0.0")
179+
self.assertEqual(cache_max_size_metric_value, "777.0")
172180

173181
cache.prefill("1", "hi")
174182

175-
items = {
176-
x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
177-
for x in filter(
178-
lambda x: b"cache_metrics_test_fgjkbdfg" in x,
179-
generate_latest(REGISTRY).split(b"\n"),
180-
)
181-
}
183+
metrics_map = get_latest_metrics()
184+
185+
cache_size_metric_value = metrics_map.get(cache_size_metric)
186+
self.assertIsNotNone(
187+
cache_size_metric_value,
188+
f"Missing metric {cache_size_metric} in cache metrics {metrics_map}",
189+
)
190+
cache_max_size_metric_value = metrics_map.get(cache_max_size_metric)
191+
self.assertIsNotNone(
192+
cache_max_size_metric_value,
193+
f"Missing metric {cache_max_size_metric} in cache metrics {metrics_map}",
194+
)
195+
196+
self.assertEqual(cache_size_metric_value, "1.0")
197+
self.assertEqual(cache_max_size_metric_value, "777.0")
198+
199+
def test_cache_metric_multiple_servers(self) -> None:
200+
"""
201+
Test that cache metrics are reported correctly across multiple servers. We will
202+
have a metrics entry for each homeserver that is labeled with the `server_name`
203+
label.
204+
"""
205+
CACHE_NAME = "cache_metric_multiple_servers_test"
206+
cache1: DeferredCache[str, str] = DeferredCache(
207+
name=CACHE_NAME, server_name="hs1", max_entries=777
208+
)
209+
cache2: DeferredCache[str, str] = DeferredCache(
210+
name=CACHE_NAME, server_name="hs2", max_entries=777
211+
)
212+
213+
metrics_map = get_latest_metrics()
214+
215+
hs1_cache_size_metric = (
216+
f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="hs1"}}'
217+
)
218+
hs2_cache_size_metric = (
219+
f'synapse_util_caches_cache_size{{name="{CACHE_NAME}",server_name="hs2"}}'
220+
)
221+
hs1_cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="hs1"}}'
222+
hs2_cache_max_size_metric = f'synapse_util_caches_cache_max_size{{name="{CACHE_NAME}",server_name="hs2"}}'
223+
224+
# Find the metrics for the caches from both homeservers
225+
hs1_cache_size_metric_value = metrics_map.get(hs1_cache_size_metric)
226+
self.assertIsNotNone(
227+
hs1_cache_size_metric_value,
228+
f"Missing metric {hs1_cache_size_metric} in cache metrics {metrics_map}",
229+
)
230+
hs2_cache_size_metric_value = metrics_map.get(hs2_cache_size_metric)
231+
self.assertIsNotNone(
232+
hs2_cache_size_metric_value,
233+
f"Missing metric {hs2_cache_size_metric} in cache metrics {metrics_map}",
234+
)
235+
hs1_cache_max_size_metric_value = metrics_map.get(hs1_cache_max_size_metric)
236+
self.assertIsNotNone(
237+
hs1_cache_max_size_metric_value,
238+
f"Missing metric {hs1_cache_max_size_metric} in cache metrics {metrics_map}",
239+
)
240+
hs2_cache_max_size_metric_value = metrics_map.get(hs2_cache_max_size_metric)
241+
self.assertIsNotNone(
242+
hs2_cache_max_size_metric_value,
243+
f"Missing metric {hs2_cache_max_size_metric} in cache metrics {metrics_map}",
244+
)
245+
246+
# Sanity check the metric values
247+
self.assertEqual(hs1_cache_size_metric_value, "0.0")
248+
self.assertEqual(hs2_cache_size_metric_value, "0.0")
249+
self.assertEqual(hs1_cache_max_size_metric_value, "777.0")
250+
self.assertEqual(hs2_cache_max_size_metric_value, "777.0")
251+
252+
# Add something to both caches to change the numbers
253+
cache1.prefill("1", "hi")
254+
cache2.prefill("2", "ho")
255+
256+
metrics_map = get_latest_metrics()
257+
258+
# Find the metrics for the caches from both homeservers
259+
hs1_cache_size_metric_value = metrics_map.get(hs1_cache_size_metric)
260+
self.assertIsNotNone(
261+
hs1_cache_size_metric_value,
262+
f"Missing metric {hs1_cache_size_metric} in cache metrics {metrics_map}",
263+
)
264+
hs2_cache_size_metric_value = metrics_map.get(hs2_cache_size_metric)
265+
self.assertIsNotNone(
266+
hs2_cache_size_metric_value,
267+
f"Missing metric {hs2_cache_size_metric} in cache metrics {metrics_map}",
268+
)
269+
hs1_cache_max_size_metric_value = metrics_map.get(hs1_cache_max_size_metric)
270+
self.assertIsNotNone(
271+
hs1_cache_max_size_metric_value,
272+
f"Missing metric {hs1_cache_max_size_metric} in cache metrics {metrics_map}",
273+
)
274+
hs2_cache_max_size_metric_value = metrics_map.get(hs2_cache_max_size_metric)
275+
self.assertIsNotNone(
276+
hs2_cache_max_size_metric_value,
277+
f"Missing metric {hs2_cache_max_size_metric} in cache metrics {metrics_map}",
278+
)
279+
280+
# Sanity check the metric values
281+
self.assertEqual(hs1_cache_size_metric_value, "1.0")
282+
self.assertEqual(hs2_cache_size_metric_value, "1.0")
283+
self.assertEqual(hs1_cache_max_size_metric_value, "777.0")
284+
self.assertEqual(hs2_cache_max_size_metric_value, "777.0")
285+
286+
287+
def get_latest_metrics() -> Dict[str, str]:
288+
"""
289+
Collect the latest metrics from the registry and parse them into an easy to use map.
290+
The key includes the metric name and labels.
291+
292+
Example output:
293+
{
294+
"synapse_util_caches_cache_size": "0.0",
295+
"synapse_util_caches_cache_max_size{name="some_cache",server_name="hs1"}": "777.0",
296+
...
297+
}
298+
"""
299+
metric_map = {
300+
x.split(b" ")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")
301+
for x in filter(
302+
lambda x: len(x) > 0 and not x.startswith(b"#"),
303+
generate_latest(REGISTRY).split(b"\n"),
304+
)
305+
}
182306

183-
self.assertEqual(items["synapse_util_caches_cache_size"], "1.0")
184-
self.assertEqual(items["synapse_util_caches_cache_max_size"], "777.0")
307+
return metric_map

0 commit comments

Comments
 (0)