From e4dbc0d4948a16fa86548ce6018eba8ba89b27b3 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 22 Jul 2025 18:27:56 -0500 Subject: [PATCH 01/12] Refactor `GaugeBucketCollector` metrics to be homeserver-scoped Part of https://github.com/element-hq/synapse/issues/18592 --- synapse/metrics/__init__.py | 114 ++++++++++++++++++++-- synapse/storage/databases/main/metrics.py | 18 ++-- 2 files changed, 117 insertions(+), 15 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index de750a5de2d..6b7c290b99f 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -33,6 +33,7 @@ Iterable, Mapping, Optional, + Sequence, Set, Tuple, Type, @@ -53,8 +54,9 @@ ) from prometheus_client.core import ( REGISTRY, - GaugeHistogramMetricFamily, GaugeMetricFamily, + Timestamp, + Sample, ) from twisted.python.threadpool import ThreadPool @@ -342,6 +344,85 @@ def _register_with_collector(self) -> None: all_gauges[self.name] = self +class GaugeHistogramMetricFamilyWithLabels(Metric): + """ + Custom version of `GaugeHistogramMetricFamily` from `prometheus_client` that allows + specifying labels and label values. + + A single gauge histogram and its samples. + + For use by custom collectors. + """ + + def __init__( + self, + *, + name: str, + documentation: str, + buckets: Optional[Sequence[Tuple[str, float]]] = None, + gsum_value: Optional[float] = None, + labelnames: StrSequence = (), + labelvalues: StrSequence = (), + unit: str = "", + ): + Metric.__init__(self, name, documentation, "gaugehistogram", unit) + + # Sanity check the number of label values matches the number of label names. + if len(labelvalues) != len(labelnames): + raise ValueError("Incorrect label count") + + self._labelnames = tuple(labelnames) + + # Create a gauge for each bucket. + if buckets is not None: + self.add_metric( + labelvalues=labelvalues, buckets=buckets, gsum_value=gsum_value + ) + + def add_metric( + self, + labelvalues: StrSequence, + buckets: Sequence[Tuple[str, float]], + gsum_value: Optional[float], + timestamp: Optional[Union[float, Timestamp]] = None, + ) -> None: + """Add a metric to the metric family. + + Args: + labelvalues: A list of label values + buckets: A list of pairs of bucket names and values. + The buckets must be sorted, and +Inf present. + gsum_value: The sum value of the metric. + """ + for bucket, value in buckets: + self.samples.append( + Sample( + self.name + "_bucket", + dict(list(zip(self._labelnames, labelvalues)) + [("le", bucket)]), + value, + timestamp, + ) + ) + # +Inf is last and provides the count value. + self.samples.extend( + [ + Sample( + self.name + "_gcount", + dict(zip(self._labelnames, labelvalues)), + buckets[-1][1], + timestamp, + ), + # TODO: Handle None gsum_value correctly. Currently a None will fail exposition but is allowed here. + Sample( + self.name + "_gsum", + dict(zip(self._labelnames, labelvalues)), + gsum_value, + timestamp, + ), # type: ignore + ] + ) + + class GaugeBucketCollector(Collector): """Like a Histogram, but the buckets are Gauges which are updated atomically. @@ -354,14 +435,17 @@ class GaugeBucketCollector(Collector): __slots__ = ( "_name", "_documentation", + "_labelnames", "_bucket_bounds", "_metric", ) def __init__( self, + *, name: str, documentation: str, + labelnames: Optional[StrSequence], buckets: Iterable[float], registry: CollectorRegistry = REGISTRY, ): @@ -375,6 +459,7 @@ def __init__( """ self._name = name self._documentation = documentation + self._labelnames = labelnames # the tops of the buckets self._bucket_bounds = [float(b) for b in buckets] @@ -386,7 +471,7 @@ def __init__( # We initially set this to None. We won't report metrics until # this has been initialised after a successful data update - self._metric: Optional[GaugeHistogramMetricFamily] = None + self._metric: Optional[GaugeHistogramMetricFamilyWithLabels] = None registry.register(self) @@ -395,15 +480,26 @@ def collect(self) -> Iterable[Metric]: if self._metric is not None: yield self._metric - def update_data(self, values: Iterable[float]) -> None: + def update_data(self, values: Iterable[float], labels: StrSequence = ()) -> None: """Update the data to be reported by the metric The existing data is cleared, and each measurement in the input is assigned to the relevant bucket. + + Args: + values + labels """ - self._metric = self._values_to_metric(values) + self._metric = self._values_to_metric(values, labels) - def _values_to_metric(self, values: Iterable[float]) -> GaugeHistogramMetricFamily: + def _values_to_metric( + self, values: Iterable[float], labels: StrSequence = () + ) -> GaugeHistogramMetricFamilyWithLabels: + """ + Args: + values + labels + """ total = 0.0 bucket_values = [0 for _ in self._bucket_bounds] @@ -421,9 +517,11 @@ def _values_to_metric(self, values: Iterable[float]) -> GaugeHistogramMetricFami # that bucket or below. accumulated_values = itertools.accumulate(bucket_values) - return GaugeHistogramMetricFamily( - self._name, - self._documentation, + return GaugeHistogramMetricFamilyWithLabels( + name=self._name, + documentation=self._documentation, + labelnames=self._labelnames, + labelvalues=labels, buckets=list( zip((str(b) for b in self._bucket_bounds), accumulated_values) ), diff --git a/synapse/storage/databases/main/metrics.py b/synapse/storage/databases/main/metrics.py index 9ce1100b5ce..a3467bff3dc 100644 --- a/synapse/storage/databases/main/metrics.py +++ b/synapse/storage/databases/main/metrics.py @@ -23,7 +23,7 @@ import time from typing import TYPE_CHECKING, Dict, List, Tuple, cast -from synapse.metrics import GaugeBucketCollector +from synapse.metrics import SERVER_NAME_LABEL, GaugeBucketCollector from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.storage._base import SQLBaseStore from synapse.storage.database import ( @@ -42,9 +42,10 @@ # Collect metrics on the number of forward extremities that exist. _extremities_collecter = GaugeBucketCollector( - "synapse_forward_extremities", - "Number of rooms on the server with the given number of forward extremities" + name="synapse_forward_extremities", + documentation="Number of rooms on the server with the given number of forward extremities" " or fewer", + labelnames=[SERVER_NAME_LABEL], buckets=[1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500], ) @@ -54,9 +55,10 @@ # we could remove from state resolution by reducing the graph to a single # forward extremity. _excess_state_events_collecter = GaugeBucketCollector( - "synapse_excess_extremity_events", - "Number of rooms on the server with the given number of excess extremity " + name="synapse_excess_extremity_events", + documentation="Number of rooms on the server with the given number of excess extremity " "events, or fewer", + labelnames=[SERVER_NAME_LABEL], buckets=[0] + [1 << n for n in range(12)], ) @@ -100,10 +102,12 @@ def fetch(txn: LoggingTransaction) -> List[Tuple[int, int]]: res = await self.db_pool.runInteraction("read_forward_extremities", fetch) - _extremities_collecter.update_data(x[0] for x in res) + _extremities_collecter.update_data( + values=(x[0] for x in res), labels=(self.server_name,) + ) _excess_state_events_collecter.update_data( - (x[0] - 1) * x[1] for x in res if x[1] + values=((x[0] - 1) * x[1] for x in res if x[1]), labels=(self.server_name,) ) async def count_daily_e2ee_messages(self) -> int: From 551a0104ff80c0760b1fd77a449df2b684dee6b5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 22 Jul 2025 19:14:44 -0500 Subject: [PATCH 02/12] Add changelog --- changelog.d/18715.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/18715.misc diff --git a/changelog.d/18715.misc b/changelog.d/18715.misc new file mode 100644 index 00000000000..afb85ef0145 --- /dev/null +++ b/changelog.d/18715.misc @@ -0,0 +1 @@ +Refactor `GaugeBucketCollector` metrics to be homeserver-scoped. From e57678ba8b9b2195bc044539ddc52ed78f62962c Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 22 Jul 2025 19:16:09 -0500 Subject: [PATCH 03/12] Fix lints --- synapse/metrics/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 6b7c290b99f..b741b5b2911 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -55,8 +55,8 @@ from prometheus_client.core import ( REGISTRY, GaugeMetricFamily, - Timestamp, Sample, + Timestamp, ) from twisted.python.threadpool import ThreadPool From f5a008ff2655f1fde678a58818c8f0e5c01c65ad Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 22 Jul 2025 19:20:32 -0500 Subject: [PATCH 04/12] Fix more lints --- synapse/metrics/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index b741b5b2911..99904a6e962 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -359,8 +359,8 @@ def __init__( *, name: str, documentation: str, + gsum_value: float, buckets: Optional[Sequence[Tuple[str, float]]] = None, - gsum_value: Optional[float] = None, labelnames: StrSequence = (), labelvalues: StrSequence = (), unit: str = "", @@ -383,7 +383,7 @@ def add_metric( self, labelvalues: StrSequence, buckets: Sequence[Tuple[str, float]], - gsum_value: Optional[float], + gsum_value: float, timestamp: Optional[Union[float, Timestamp]] = None, ) -> None: """Add a metric to the metric family. @@ -412,13 +412,12 @@ def add_metric( buckets[-1][1], timestamp, ), - # TODO: Handle None gsum_value correctly. Currently a None will fail exposition but is allowed here. Sample( self.name + "_gsum", dict(zip(self._labelnames, labelvalues)), gsum_value, timestamp, - ), # type: ignore + ), ] ) @@ -459,7 +458,7 @@ def __init__( """ self._name = name self._documentation = documentation - self._labelnames = labelnames + self._labelnames = labelnames if labelnames else () # the tops of the buckets self._bucket_bounds = [float(b) for b in buckets] From 5dd08ec5d11c6b1904a9d342438ca7ce7f8498ec Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 22 Jul 2025 22:33:57 -0500 Subject: [PATCH 05/12] Move test to match source file --- .../{test_event_metrics.py => databases/main/test_metrics.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/storage/{test_event_metrics.py => databases/main/test_metrics.py} (100%) diff --git a/tests/storage/test_event_metrics.py b/tests/storage/databases/main/test_metrics.py similarity index 100% rename from tests/storage/test_event_metrics.py rename to tests/storage/databases/main/test_metrics.py From d4c1a4819655883e8e91871dd6e485c5824c63b6 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 22 Jul 2025 22:35:27 -0500 Subject: [PATCH 06/12] Add server_name to metrics output in tests --- tests/storage/databases/main/test_metrics.py | 30 ++++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/storage/databases/main/test_metrics.py b/tests/storage/databases/main/test_metrics.py index fc6e02545fc..be59e1b67e2 100644 --- a/tests/storage/databases/main/test_metrics.py +++ b/tests/storage/databases/main/test_metrics.py @@ -65,24 +65,24 @@ def test_exposed_to_prometheus(self) -> None: ) expected = [ - b'synapse_forward_extremities_bucket{le="1.0"} 0.0', - b'synapse_forward_extremities_bucket{le="2.0"} 2.0', - b'synapse_forward_extremities_bucket{le="3.0"} 2.0', - b'synapse_forward_extremities_bucket{le="5.0"} 2.0', - b'synapse_forward_extremities_bucket{le="7.0"} 3.0', - b'synapse_forward_extremities_bucket{le="10.0"} 3.0', - b'synapse_forward_extremities_bucket{le="15.0"} 3.0', - b'synapse_forward_extremities_bucket{le="20.0"} 3.0', - b'synapse_forward_extremities_bucket{le="50.0"} 3.0', - b'synapse_forward_extremities_bucket{le="100.0"} 3.0', - b'synapse_forward_extremities_bucket{le="200.0"} 3.0', - b'synapse_forward_extremities_bucket{le="500.0"} 3.0', + b'synapse_forward_extremities_bucket{le="1.0",server_name="test"} 0.0', + b'synapse_forward_extremities_bucket{le="2.0",server_name="test"} 2.0', + b'synapse_forward_extremities_bucket{le="3.0",server_name="test"} 2.0', + b'synapse_forward_extremities_bucket{le="5.0",server_name="test"} 2.0', + b'synapse_forward_extremities_bucket{le="7.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="10.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="15.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="20.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="50.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="100.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="200.0",server_name="test"} 3.0', + b'synapse_forward_extremities_bucket{le="500.0",server_name="test"} 3.0', # per https://docs.google.com/document/d/1KwV0mAXwwbvvifBvDKH_LU1YjyXE_wxCkHNoCGq1GX0/edit#heading=h.wghdjzzh72j9, # "inf" is valid: "this includes variants such as inf" - b'synapse_forward_extremities_bucket{le="inf"} 3.0', + b'synapse_forward_extremities_bucket{le="inf",server_name="test"} 3.0', b"# TYPE synapse_forward_extremities_gcount gauge", - b"synapse_forward_extremities_gcount 3.0", + b'synapse_forward_extremities_gcount{server_name="test"} 3.0', b"# TYPE synapse_forward_extremities_gsum gauge", - b"synapse_forward_extremities_gsum 10.0", + b'synapse_forward_extremities_gsum{server_name="test"} 10.0', ] self.assertEqual(items, expected) From cdfb7379455c144027ebeede67ad4cdf6d756964 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 25 Jul 2025 15:11:54 -0500 Subject: [PATCH 07/12] Inherit from `GaugeHistogramMetricFamily` See https://github.com/element-hq/synapse/pull/18715#discussion_r2229723681 --- synapse/metrics/__init__.py | 51 +++---------------------------------- 1 file changed, 3 insertions(+), 48 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index e53e39b98c4..ab408672b00 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -54,9 +54,8 @@ ) from prometheus_client.core import ( REGISTRY, + GaugeHistogramMetricFamily, GaugeMetricFamily, - Sample, - Timestamp, ) from twisted.python.threadpool import ThreadPool @@ -345,7 +344,7 @@ def _register_with_collector(self) -> None: all_gauges[self.name] = self -class GaugeHistogramMetricFamilyWithLabels(Metric): +class GaugeHistogramMetricFamilyWithLabels(GaugeHistogramMetricFamily): """ Custom version of `GaugeHistogramMetricFamily` from `prometheus_client` that allows specifying labels and label values. @@ -376,51 +375,7 @@ def __init__( # Create a gauge for each bucket. if buckets is not None: - self.add_metric( - labelvalues=labelvalues, buckets=buckets, gsum_value=gsum_value - ) - - def add_metric( - self, - labelvalues: StrSequence, - buckets: Sequence[Tuple[str, float]], - gsum_value: float, - timestamp: Optional[Union[float, Timestamp]] = None, - ) -> None: - """Add a metric to the metric family. - - Args: - labelvalues: A list of label values - buckets: A list of pairs of bucket names and values. - The buckets must be sorted, and +Inf present. - gsum_value: The sum value of the metric. - """ - for bucket, value in buckets: - self.samples.append( - Sample( - self.name + "_bucket", - dict(list(zip(self._labelnames, labelvalues)) + [("le", bucket)]), - value, - timestamp, - ) - ) - # +Inf is last and provides the count value. - self.samples.extend( - [ - Sample( - self.name + "_gcount", - dict(zip(self._labelnames, labelvalues)), - buckets[-1][1], - timestamp, - ), - Sample( - self.name + "_gsum", - dict(zip(self._labelnames, labelvalues)), - gsum_value, - timestamp, - ), - ] - ) + self.add_metric(labels=labelvalues, buckets=buckets, gsum_value=gsum_value) class GaugeBucketCollector(Collector): From 202ef03af7fb720f928d53cb6c21c4f76d537c3e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 25 Jul 2025 15:21:59 -0500 Subject: [PATCH 08/12] Fix grammar See https://github.com/element-hq/synapse/pull/18725#discussion_r2229692212 --- scripts-dev/mypy_synapse_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 2ccac97103b..8597d6d5113 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -98,7 +98,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy ensures metrics are correctly separated by homeserver. There are also some metrics that apply at the process level, such as CPU usage, - Python garbage collection, Twisted reactor tick time which shouldn't have the + Python garbage collection, and Twisted reactor tick time, which shouldn't have the `SERVER_NAME_LABEL`. In those cases, use use a type ignore comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`. """ From 26eef52f854ba9007c4461f702ea93bddf61f6f8 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Fri, 25 Jul 2025 15:22:31 -0500 Subject: [PATCH 09/12] Fix duplicate word typo See https://github.com/element-hq/synapse/pull/18725#discussion_r2229706490 --- scripts-dev/mypy_synapse_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts-dev/mypy_synapse_plugin.py b/scripts-dev/mypy_synapse_plugin.py index 8597d6d5113..7c674eef1b4 100644 --- a/scripts-dev/mypy_synapse_plugin.py +++ b/scripts-dev/mypy_synapse_plugin.py @@ -99,7 +99,7 @@ def check_prometheus_metric_instantiation(ctx: FunctionSigContext) -> CallableTy There are also some metrics that apply at the process level, such as CPU usage, Python garbage collection, and Twisted reactor tick time, which shouldn't have the - `SERVER_NAME_LABEL`. In those cases, use use a type ignore comment to disable the + `SERVER_NAME_LABEL`. In those cases, use a type ignore comment to disable the check, e.g. `# type: ignore[missing-server-name-label]`. """ # The true signature, this isn't being modified so this is what will be returned. From cb7f9bbc526a5663be5b719e8e438063923d89e5 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 29 Jul 2025 10:28:00 -0500 Subject: [PATCH 10/12] Use better stable API for setting label names See https://github.com/element-hq/synapse/pull/18715#discussion_r2229723681 --- synapse/metrics/__init__.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index ab408672b00..6e87c1685b1 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -371,7 +371,18 @@ def __init__( if len(labelvalues) != len(labelnames): raise ValueError("Incorrect label count") - self._labelnames = tuple(labelnames) + # Call the super to set the labelnames. We use this stable API instead of + # setting the internal `_labelnames` field directly. + super().__init__( + name=name, + documentation=documentation, + labels=labelnames, + # Since `GaugeHistogramMetricFamily` doesn't support supplying `labels` and + # `buckets` at the same time (artificial limitation), we will just set these + # as `None` and set up the buckets ourselves just below. + buckets=None, + gsum_value=None, + ) # Create a gauge for each bucket. if buckets is not None: From ba0352541a71a172c0bee364c735a40ec69a881e Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 29 Jul 2025 10:34:01 -0500 Subject: [PATCH 11/12] Further simplify --- synapse/metrics/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 6e87c1685b1..3d4dfb30075 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -365,14 +365,12 @@ def __init__( labelvalues: StrSequence = (), unit: str = "", ): - Metric.__init__(self, name, documentation, "gaugehistogram", unit) - # Sanity check the number of label values matches the number of label names. if len(labelvalues) != len(labelnames): raise ValueError("Incorrect label count") - # Call the super to set the labelnames. We use this stable API instead of - # setting the internal `_labelnames` field directly. + # Call the super to validate and set the labelnames. We use this stable API + # instead of setting the internal `_labelnames` field directly. super().__init__( name=name, documentation=documentation, From 94f196072818f4ec81284c1bef5a4d335160a5db Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 29 Jul 2025 10:34:48 -0500 Subject: [PATCH 12/12] Better error --- synapse/metrics/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index 3d4dfb30075..721c3c6d536 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -367,7 +367,9 @@ def __init__( ): # Sanity check the number of label values matches the number of label names. if len(labelvalues) != len(labelnames): - raise ValueError("Incorrect label count") + raise ValueError( + "The number of label values must match the number of label names" + ) # Call the super to validate and set the labelnames. We use this stable API # instead of setting the internal `_labelnames` field directly.