Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ba3bbbb
Add `INSTANCE_LABEL_NAME` to `register_cache(...)`
MadLittleMods Jun 27, 2025
749b7a4
Fill in `ExpiringCache`
MadLittleMods Jun 27, 2025
8e71fcd
Fill in `ResponseCache`
MadLittleMods Jun 27, 2025
61fc9ba
Fill in `StreamChangeCache`
MadLittleMods Jun 27, 2025
74610aa
Fill in `TTLCache`
MadLittleMods Jun 27, 2025
8dbca87
Fill in `LruCache` except for `@cached`
MadLittleMods Jun 27, 2025
1e57b57
Fix `LruCache` positional argument lint
MadLittleMods Jun 27, 2025
19c917c
Attempt `@cached` solution v1
MadLittleMods Jun 27, 2025
0453666
Fix missing `server_name` on `ExpiringCache` usage
MadLittleMods Jun 27, 2025
a17206f
Fix arguments in `DeferredCache` usage
MadLittleMods Jun 27, 2025
4fcfda0
Merge branch 'develop' into madlittlemods/per-hs-metrics-cache
MadLittleMods Jun 30, 2025
d10d862
Add changelog
MadLittleMods Jun 30, 2025
9895b3b
Fill in missing `LruCache` usage
MadLittleMods Jun 30, 2025
9eae037
Fix mypy complaining about unknown types
MadLittleMods Jun 30, 2025
ee91f6b
Better explain usage
MadLittleMods Jun 30, 2025
1917a0b
Fill in `server_name` attribute for `ApplicationService` (for `@cached`)
MadLittleMods Jun 30, 2025
64ed156
Fill in `server_name` attribute for `@cached`
MadLittleMods Jun 30, 2025
e943bb1
Fix more `ApplicationService` usage/mocks
MadLittleMods Jun 30, 2025
b3ecd5c
Fix `ApplicationService` `sender` usage in test
MadLittleMods Jun 30, 2025
f7e6f09
Fix `CacheMetric` splatting label objects as arguments
MadLittleMods Jun 30, 2025
9078076
Fix `ApplicationService` `sender` usage in test2
MadLittleMods Jun 30, 2025
42699c4
`instance` -> `server_name` label
MadLittleMods Jul 3, 2025
22cee6f
Add comment about why `self.server_name` necessary for `@cached`
MadLittleMods Jul 3, 2025
f5d9558
Merge branch 'develop' into madlittlemods/per-hs-metrics-cache
MadLittleMods Jul 3, 2025
2b13387
Merge branch 'develop' into madlittlemods/per-hs-metrics-cache
MadLittleMods Jul 4, 2025
71fbcf6
Merge branch 'develop' into madlittlemods/per-hs-metrics-cache
MadLittleMods Jul 9, 2025
6676f05
Merge branch 'develop' into madlittlemods/per-hs-metrics-cache
MadLittleMods Jul 15, 2025
0bb7eae
Move comment to match arg order
MadLittleMods Jul 15, 2025
b5f00c8
Remove Cargo.lock changes
MadLittleMods Jul 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions synapse/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,17 @@

HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")

INSTANCE_LABEL_NAME = "instance"
Comment thread
MadLittleMods marked this conversation as resolved.
Outdated
"""
The standard Prometheus label name used to identify which server instance the metrics
came from.
In the case of a Synapse homeserver, this should be set to the homeserver name
(`hs.hostname`).
Normally, this would be set automatically by the Prometheus server scraping the data but
since we support multiple instances of Synapse running in the same process and all
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

absolute nit, readability:

Suggested change
Normally, this would be set automatically by the Prometheus server scraping the data but
since we support multiple instances of Synapse running in the same process and all
Normally, this would be set automatically by the Prometheus server scraping the data. But
since we support multiple instances of Synapse running in the same process and all

Copy link
Copy Markdown
Contributor Author

@MadLittleMods MadLittleMods Jul 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is from an old version of the diff. This is the latest:

SERVER_NAME_LABEL = "server_name"
"""
The `server_name` label is used to identify the homeserver that the metrics correspond
to. Because we support multiple instances of Synapse running in the same process and all
metrics are in a single global `REGISTRY`, we need to manually label any metrics.
In the case of a Synapse homeserver, this should be set to the homeserver name
(`hs.hostname`).
We're purposely not using the `instance` label for this purpose as that should be "The
<host>:<port> part of the target's URL that was scraped.". Also: "In Prometheus
terms, an endpoint you can scrape is called an *instance*, usually corresponding to a
single process." (source: https://prometheus.io/docs/concepts/jobs_instances/)
"""

metrics are in a single global `REGISTRY`, we need to manually label any metrics.
"""


class _RegistryProxy:
@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions synapse/push/bulk_push_rule_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,8 @@ def __init__(self, hs: "HomeServer"):
self._related_event_match_enabled = self.hs.config.experimental.msc3664_enabled

self.room_push_rule_cache_metrics = register_cache(
"cache",
"room_push_rule_cache",
cache_type="cache",
cache_name="room_push_rule_cache",
cache=[], # Meaningless size, as this isn't a cache that stores values,
resizable=False,
)
Expand Down
83 changes: 55 additions & 28 deletions synapse/util/caches/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from prometheus_client.core import Gauge

from synapse.config.cache import add_resizable_cache
from synapse.metrics import INSTANCE_LABEL_NAME
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The metrics being refactored to be homeserver scoped are in this file.

The rest of the changes are to support that change and supply the server_name to the instance label.

from synapse.util.metrics import DynamicCollectorRegistry

logger = logging.getLogger(__name__)
Expand All @@ -46,50 +47,65 @@
caches_by_name: Dict[str, Sized] = {}

cache_size = Gauge(
"synapse_util_caches_cache_size", "", ["name"], registry=CACHE_METRIC_REGISTRY
"synapse_util_caches_cache_size",
"",
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
cache_hits = Gauge(
"synapse_util_caches_cache_hits", "", ["name"], registry=CACHE_METRIC_REGISTRY
"synapse_util_caches_cache_hits",
"",
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
cache_evicted = Gauge(
"synapse_util_caches_cache_evicted_size",
"",
["name", "reason"],
labelnames=["name", "reason", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
cache_total = Gauge(
"synapse_util_caches_cache", "", ["name"], registry=CACHE_METRIC_REGISTRY
"synapse_util_caches_cache",
"",
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
cache_max_size = Gauge(
"synapse_util_caches_cache_max_size", "", ["name"], registry=CACHE_METRIC_REGISTRY
"synapse_util_caches_cache_max_size",
"",
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
cache_memory_usage = Gauge(
"synapse_util_caches_cache_size_bytes",
"Estimated memory usage of the caches",
["name"],
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)

response_cache_size = Gauge(
"synapse_util_caches_response_cache_size",
"",
["name"],
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
response_cache_hits = Gauge(
"synapse_util_caches_response_cache_hits",
"",
["name"],
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
response_cache_evicted = Gauge(
"synapse_util_caches_response_cache_evicted_size",
"",
["name", "reason"],
labelnames=["name", "reason", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)
response_cache_total = Gauge(
"synapse_util_caches_response_cache", "", ["name"], registry=CACHE_METRIC_REGISTRY
"synapse_util_caches_response_cache",
"",
labelnames=["name", INSTANCE_LABEL_NAME],
registry=CACHE_METRIC_REGISTRY,
)


Expand All @@ -103,12 +119,13 @@ class EvictionReason(Enum):
invalidation = auto()


@attr.s(slots=True, auto_attribs=True)
@attr.s(slots=True, auto_attribs=True, kw_only=True)
class CacheMetric:
_cache: Sized
_cache_type: str
_cache_name: str
_collect_callback: Optional[Callable]
_server_name: str

hits: int = 0
misses: int = 0
Expand Down Expand Up @@ -145,34 +162,34 @@ def describe(self) -> List[str]:

def collect(self) -> None:
try:
labels_base = {
"name": self._cache_name,
INSTANCE_LABEL_NAME: self._server_name,
}
if self._cache_type == "response_cache":
response_cache_size.labels(self._cache_name).set(len(self._cache))
response_cache_hits.labels(self._cache_name).set(self.hits)
response_cache_size.labels(**labels_base).set(len(self._cache))
response_cache_hits.labels(**labels_base).set(self.hits)
for reason in EvictionReason:
response_cache_evicted.labels(self._cache_name, reason.name).set(
self.eviction_size_by_reason[reason]
)
response_cache_total.labels(self._cache_name).set(
self.hits + self.misses
)
response_cache_evicted.labels(
{**labels_base, "reason": reason.name}
).set(self.eviction_size_by_reason[reason])
response_cache_total.labels(**labels_base).set(self.hits + self.misses)
else:
cache_size.labels(self._cache_name).set(len(self._cache))
cache_hits.labels(self._cache_name).set(self.hits)
cache_size.labels(**labels_base).set(len(self._cache))
cache_hits.labels(**labels_base).set(self.hits)
for reason in EvictionReason:
cache_evicted.labels(self._cache_name, reason.name).set(
cache_evicted.labels({**labels_base, "reason": reason.name}).set(
self.eviction_size_by_reason[reason]
)
cache_total.labels(self._cache_name).set(self.hits + self.misses)
cache_total.labels(**labels_base).set(self.hits + self.misses)
max_size = getattr(self._cache, "max_size", None)
if max_size:
cache_max_size.labels(self._cache_name).set(max_size)
cache_max_size.labels(**labels_base).set(max_size)

if TRACK_MEMORY_USAGE:
# self.memory_usage can be None if nothing has been inserted
# into the cache yet.
cache_memory_usage.labels(self._cache_name).set(
self.memory_usage or 0
)
cache_memory_usage.labels(**labels_base).set(self.memory_usage or 0)
if self._collect_callback:
self._collect_callback()
except Exception as e:
Expand All @@ -181,9 +198,11 @@ def collect(self) -> None:


def register_cache(
*,
cache_type: str,
cache_name: str,
cache: Sized,
server_name: str,
collect_callback: Optional[Callable] = None,
resizable: bool = True,
resize_callback: Optional[Callable] = None,
Expand All @@ -196,6 +215,8 @@ def register_cache(
cache_name: name of the cache
cache: cache itself, which must implement __len__(), and may optionally implement
a max_size property
server_name: The homeserver name that this cache is associated with
(`hs.hostname`), used to label the metric.
collect_callback: If given, a function which is called during metric
collection to update additional metrics.
resizable: Whether this cache supports being resized, in which case either
Expand All @@ -210,7 +231,13 @@ def register_cache(
resize_callback = cache.set_cache_factor # type: ignore
add_resizable_cache(cache_name, resize_callback)

metric = CacheMetric(cache, cache_type, cache_name, collect_callback)
metric = CacheMetric(
cache=cache,
cache_type=cache_type,
cache_name=cache_name,
server_name=server_name,
collect_callback=collect_callback,
)
metric_name = "cache_%s_%s" % (cache_type, cache_name)
caches_by_name[cache_name] = cache
CACHE_METRIC_REGISTRY.register_hook(metric_name, metric.collect)
Expand Down
4 changes: 3 additions & 1 deletion synapse/util/caches/expiringcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def __init__(

self.iterable = iterable

self.metrics = register_cache("expiring", cache_name, self)
self.metrics = register_cache(
cache_type="expiring", cache_name=cache_name, cache=self
)

if not self._expiry_ms:
# Don't bother starting the loop if things never expire
Expand Down
6 changes: 3 additions & 3 deletions synapse/util/caches/lrucache.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,9 +459,9 @@ def __init__(

if cache_name is not None:
metrics: Optional[CacheMetric] = register_cache(
"lru_cache",
cache_name,
self,
cache_type="lru_cache",
cache_name=cache_name,
cache=self,
collect_callback=metrics_collection_callback,
)
else:
Expand Down
4 changes: 3 additions & 1 deletion synapse/util/caches/response_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ def __init__(
self.timeout_sec = timeout_ms / 1000.0

self._name = name
self._metrics = register_cache("response_cache", name, self, resizable=False)
self._metrics = register_cache(
cache_type="response_cache", cache_name=name, cache=self, resizable=False
)
self._enable_logging = enable_logging

def size(self) -> int:
Expand Down
5 changes: 4 additions & 1 deletion synapse/util/caches/stream_change_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@ def __init__(

self.name = name
self.metrics = caches.register_cache(
"cache", self.name, self._cache, resize_callback=self.set_cache_factor
cache_type="cache",
cache_name=self.name,
cache=self._cache,
resize_callback=self.set_cache_factor,
)

if prefilled_cache:
Expand Down
4 changes: 3 additions & 1 deletion synapse/util/caches/ttlcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ def __init__(self, cache_name: str, timer: Callable[[], float] = time.time):

self._timer = timer

self._metrics = register_cache("ttl", cache_name, self, resizable=False)
self._metrics = register_cache(
cache_type="ttl", cache_name=cache_name, cache=self, resizable=False
)

def set(self, key: KT, value: VT, ttl: float) -> None:
"""Add/update an entry in the cache
Expand Down