Skip to content

Commit 0fc3d87

Browse files
committed
Fix high idle memory usage introduced in 2.9
2.9 imported torch into the process at startup (via the always-on audio analysis controller and the default-enabled Smart Fades provider), pushing idle RAM from ~300-500MB to ~1GB and causing OOM on small hardware.

- Gate Smart Fades (>=6GB / 4 cores) and Sonic Analysis (>=8GB / 4 cores) on system resources, raising a non-retrying UnsupportedSystemError when unmet
- Stop loading torch at idle: lazy-import it in the audio analysis controller and Smart Fades provider, and configure torch thread caps on first analysis
- Auto-created default providers that do not meet requirements are removed and not retried, instead of lingering as a broken/retrying provider
- Lazy-load the Sonic Similarity text encoder on first query
- RAM-filter the buffer-size options (Balanced >=4GB, Maximum >=8GB)
1 parent 3dddcc9 commit 0fc3d87

12 files changed

Lines changed: 245 additions & 29 deletions

File tree

music_assistant/constants.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1107,7 +1107,9 @@ def create_sample_rates_config_entry(
11071107
("heos", True, lambda: True),
11081108
("wiim", True, lambda: True),
11091109
("party", False, lambda: True),
1110-
("smart_fades", False, lambda: (os.cpu_count() or 1) > 1),
1110+
# smart_fades gates on system requirements (RAM/CPU) in its own setup(); an
1111+
# under-spec host has the auto-created config removed again at load time.
1112+
("smart_fades", False, lambda: True),
11111113
("lastfm_recommendations", False, lambda: True),
11121114
}
11131115

music_assistant/controllers/streams/audio.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,10 @@
6767
MASS_LOGGER_NAME,
6868
VERBOSE_LOG_LEVEL,
6969
)
70-
from music_assistant.controllers.streams.audio_analysis import LOUDNESS_ANALYSIS_DOMAIN
70+
from music_assistant.controllers.streams.audio_analysis import (
71+
LOUDNESS_ANALYSIS_DOMAIN,
72+
SMART_FADES_ANALYSIS_DOMAIN,
73+
)
7174
from music_assistant.controllers.streams.audio_buffer import AudioBuffer
7275
from music_assistant.controllers.streams.constants import (
7376
CACHE_CATEGORY_RESOLVED_RADIO_URL,
@@ -2054,10 +2057,11 @@ async def get_queue_flow_stream(
20542057
if flow_player
20552058
else []
20562059
)
2057-
# smart crossfade requires a large buffer for beat analysis
2058-
if (
2059-
smart_fades_mode == SmartFadesMode.SMART_CROSSFADE
2060-
and self.mass.config.get_raw_core_config_value(
2060+
# smart crossfade needs the smart_fades analysis provider (for beat/key data) and a
2061+
# non-minimal buffer for beat analysis; fall back to standard crossfade otherwise.
2062+
if smart_fades_mode == SmartFadesMode.SMART_CROSSFADE and (
2063+
self.mass.get_provider(SMART_FADES_ANALYSIS_DOMAIN) is None
2064+
or self.mass.config.get_raw_core_config_value(
20612065
"streams", CONF_BUFFER_SIZE, CONF_BUFFER_SIZE_DEFAULT
20622066
)
20632067
== BufferSize.MINIMAL

music_assistant/controllers/streams/audio_analysis.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from math import inf
1313
from typing import TYPE_CHECKING, Any
1414

15-
import torch
1615
from music_assistant_models.audio_analysis import AudioAnalysisCoverage
1716
from music_assistant_models.background_task import TaskSchedule
1817
from music_assistant_models.enums import ContentType, MediaType, ProviderType, StreamType
@@ -137,10 +136,10 @@ def __init__(self, streams: StreamsController) -> None:
137136
self.logger = self.mass.logger.getChild("audio_analysis")
138137
self._active_sessions: dict[str, set[str]] = {}
139138
self._workers: dict[str, asyncio.Task[None]] = {}
139+
self._thread_caps_configured = False
140140

141141
def setup(self) -> None:
142-
"""Register the nightly background scan task and apply CPU caps."""
143-
self._configure_thread_caps()
142+
"""Register the nightly background scan task."""
144143
utc_hour, utc_minute = local_clock_time_to_utc(0, 0)
145144
self.mass.tasks.register_scheduled_task(
146145
task_id=BACKGROUND_SCAN_TASK_ID,
@@ -163,8 +162,15 @@ async def close(self) -> None:
163162
if workers:
164163
await asyncio.gather(*workers, return_exceptions=True)
165164

166-
def _configure_thread_caps(self) -> None:
167-
"""Cap PyTorch threading so Audio Analysis inference stays around a quarter of cpu_count."""
165+
def _ensure_thread_caps_configured(self) -> None:
166+
"""Cap PyTorch threading (once) so Audio Analysis inference stays around a quarter of cpu_count."""
167+
# Import torch lazily and only when analysis actually starts, so an idle server
168+
# with no audio analysis provider enabled never imports torch.
169+
if self._thread_caps_configured:
170+
return
171+
self._thread_caps_configured = True
172+
import torch # noqa: PLC0415
173+
168174
budget = self._aa_thread_budget()
169175
torch.set_num_threads(budget)
170176
with contextlib.suppress(RuntimeError):
@@ -955,6 +961,9 @@ async def _start_analysis_on_providers(
955961
providers: list[AudioAnalysisProvider],
956962
) -> set[str]:
957963
"""Call start_analysis on each provider, returning IDs of those that accepted."""
964+
# Apply torch thread caps now that analysis is actually starting (lazy, once).
965+
# This is the shared chokepoint for both the live and background-scan paths.
966+
self._ensure_thread_caps_configured()
958967
provider_ids: set[str] = set()
959968
for provider in providers:
960969
try:

music_assistant/controllers/streams/constants.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,27 @@ class BufferSize(StrEnum):
4141
CONF_BUFFER_SIZE: Final[str] = "buffer_size"
4242

4343

44+
def get_available_buffer_sizes() -> list[BufferSize]:
45+
"""
46+
Return the buffer-size presets allowed for this host's RAM.
47+
48+
Minimal is always available; Balanced requires >= 4GB and Maximum >= 8GB. When total
49+
memory is unknown (0.0, e.g. Windows) all presets are offered (fail open).
50+
"""
51+
if TOTAL_SYSTEM_MEMORY_GB == 0.0:
52+
return [BufferSize.MINIMAL, BufferSize.BALANCED, BufferSize.MAXIMUM]
53+
sizes = [BufferSize.MINIMAL]
54+
if TOTAL_SYSTEM_MEMORY_GB >= 4.0:
55+
sizes.append(BufferSize.BALANCED)
56+
if TOTAL_SYSTEM_MEMORY_GB >= 8.0:
57+
sizes.append(BufferSize.MAXIMUM)
58+
return sizes
59+
60+
4461
def _get_default_buffer_size() -> str:
4562
if TOTAL_SYSTEM_MEMORY_GB >= 8.0:
4663
return BufferSize.MAXIMUM
47-
if TOTAL_SYSTEM_MEMORY_GB > 4.0:
64+
if TOTAL_SYSTEM_MEMORY_GB >= 4.0:
4865
return BufferSize.BALANCED
4966
return BufferSize.MINIMAL
5067

music_assistant/controllers/streams/controller.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@
6969
CONF_BUFFER_SIZE_DEFAULT,
7070
CONF_SMART_FADES_LOG_LEVEL,
7171
DEFAULT_PORT,
72-
BufferSize,
72+
get_available_buffer_sizes,
7373
)
7474
from music_assistant.helpers.audio import (
7575
calculate_content_length,
@@ -189,10 +189,11 @@ async def get_config_entries(
189189
"good balance for most systems.\n"
190190
"- **Maximum**: Large buffer, "
191191
"best performance for systems with plenty of memory.",
192+
# Only offer presets the host's RAM can sustain (Balanced >= 4GB,
193+
# Maximum >= 8GB); see get_available_buffer_sizes.
192194
options=[
193-
ConfigValueOption("Minimal", BufferSize.MINIMAL.value),
194-
ConfigValueOption("Balanced", BufferSize.BALANCED.value),
195-
ConfigValueOption("Maximum", BufferSize.MAXIMUM.value),
195+
ConfigValueOption(size.value.title(), size.value)
196+
for size in get_available_buffer_sizes()
196197
],
197198
required=False,
198199
category="playback",

music_assistant/helpers/util.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,52 @@ def get_total_system_memory() -> float:
130130
return 0.0
131131

132132

133+
class UnsupportedSystemError(SetupFailedError):
134+
"""
135+
Raised when the host does not meet a provider's minimum requirements.
136+
137+
Subclass of SetupFailedError so existing setup handling still applies, but it
138+
marks a permanent condition (RAM, CPU cores, CPU capability) that will not
139+
resolve at runtime, so the provider load must not be retried.
140+
"""
141+
142+
143+
def verify_system_meets_requirements(
144+
*,
145+
feature_name: str,
146+
min_memory_gb: float = 0.0,
147+
min_cpu_cores: int = 0,
148+
) -> None:
149+
"""
150+
Verify the host meets the minimum CPU/RAM requirements for a heavy provider.
151+
152+
:param feature_name: Human-readable provider name used in the error message.
153+
:param min_memory_gb: Minimum total system RAM in GB (0 disables the check).
154+
:param min_cpu_cores: Minimum CPU core count (0 disables the check).
155+
:raises UnsupportedSystemError: If the system does not meet the requirements.
156+
"""
157+
cpu_cores = os.process_cpu_count() or os.cpu_count() or 1
158+
if min_cpu_cores and cpu_cores < min_cpu_cores:
159+
raise UnsupportedSystemError(
160+
f"This system does not meet the minimal requirements for {feature_name}: "
161+
f"at least {min_cpu_cores} CPU cores are required ({cpu_cores} detected)."
162+
)
163+
total_memory_gb = get_total_system_memory()
164+
# get_total_system_memory() returns 0.0 when the platform cannot report memory
165+
# (e.g. Windows); treat that as unknown and fail open rather than block setup.
166+
if min_memory_gb and total_memory_gb and total_memory_gb < min_memory_gb:
167+
raise UnsupportedSystemError(
168+
f"This system does not meet the minimal requirements for {feature_name}: "
169+
f"at least {min_memory_gb:.0f}GB of RAM is required "
170+
f"({total_memory_gb:.1f}GB detected)."
171+
)
172+
173+
133174
def verify_cpu_supports_ml_inference() -> None:
134175
"""
135176
Verify the CPU can run on-device ML (torch) inference.
136177
137-
:raises SetupFailedError: If this is an x86 CPU without AVX2 support, which
178+
:raises UnsupportedSystemError: If this is an x86 CPU without AVX2 support, which
138179
torch's FBGEMM quantized backend requires.
139180
"""
140181
if platform.machine().lower() not in ("x86_64", "amd64", "i386", "i686", "x86"):
@@ -143,7 +184,7 @@ def verify_cpu_supports_ml_inference() -> None:
143184
import torch # noqa: PLC0415
144185

145186
if torch.backends.cpu.get_cpu_capability() in ("DEFAULT", "NO AVX"):
146-
raise SetupFailedError(
187+
raise UnsupportedSystemError(
147188
"On-device audio analysis requires a CPU with AVX2 support "
148189
"(Intel Haswell / AMD Zen or newer). This CPU does not support AVX2. "
149190
"If you are running in a virtual machine (e.g. Proxmox), changing the "

music_assistant/mass.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from music_assistant.helpers.images import get_icon_string
5050
from music_assistant.helpers.util import (
5151
TaskManager,
52+
UnsupportedSystemError,
5253
get_package_version,
5354
is_hass_supervisor,
5455
load_provider_module,
@@ -713,6 +714,7 @@ async def load_provider(
713714
self,
714715
instance_id: str,
715716
allow_retry: bool = False,
717+
remove_if_unsupported: bool = False,
716718
) -> None:
717719
"""Try to load a provider and catch errors."""
718720
try:
@@ -732,6 +734,27 @@ async def load_provider(
732734

733735
try:
734736
await self.load_provider_config(prov_conf)
737+
except UnsupportedSystemError as exc:
738+
# The host does not meet this provider's hardware requirements. This is a
739+
# permanent condition, so we never retry. For a provider that was just
740+
# auto-set-up as a default, drop the config again so it does not linger as a
741+
# broken provider (it stays marked done so it is not auto-created again).
742+
if remove_if_unsupported:
743+
LOGGER.info(
744+
"Not enabling default provider %s: %s",
745+
prov_conf.name or prov_conf.instance_id,
746+
exc,
747+
)
748+
await self.config.remove_provider_config(instance_id)
749+
return
750+
prov_conf.last_error = str(exc)
751+
self.config.set(f"{CONF_PROVIDERS}/{instance_id}/last_error", str(exc))
752+
LOGGER.warning(
753+
"Provider(instance) %s can not run on this system: %s",
754+
prov_conf.name or prov_conf.instance_id,
755+
exc,
756+
)
757+
return
735758
except Exception as exc:
736759
# if loading failed, we store the error in the config object
737760
# so we can show something useful to the user
@@ -895,6 +918,7 @@ async def _load_providers(self) -> None:
895918
self.config.set_default(CONF_DEFAULT_PROVIDERS_SETUP, set())
896919
default_providers_setup = set(self.config.get(CONF_DEFAULT_PROVIDERS_SETUP))
897920
changes_made = False
921+
newly_created_defaults: set[str] = set()
898922
for default_provider, require_mdns, precondition in DEFAULT_PROVIDERS:
899923
if default_provider in default_providers_setup:
900924
# already processed/setup before, skip
@@ -915,6 +939,7 @@ async def _load_providers(self) -> None:
915939
continue
916940
await self.config.create_builtin_provider_config(manifest.domain)
917941
changes_made = True
942+
newly_created_defaults.add(manifest.domain)
918943
# TEMP: migration - to be removed after 2.8 release
919944
# enable all existing players of the default providers if they are not already enabled
920945
# due to the linked protocol feature we introduced
@@ -944,7 +969,15 @@ async def _load_providers(self) -> None:
944969
for prov_conf in other_configs:
945970
# Use a task so we can load multiple providers at once.
946971
# If a provider fails, that will not block the loading of other providers.
947-
tg.create_task(self.load_provider(prov_conf.instance_id, allow_retry=True))
972+
# For providers just auto-set-up as a default, drop the config again if the
973+
# host does not meet their requirements (rather than retry a broken provider).
974+
tg.create_task(
975+
self.load_provider(
976+
prov_conf.instance_id,
977+
allow_retry=True,
978+
remove_if_unsupported=prov_conf.domain in newly_created_defaults,
979+
)
980+
)
948981

949982
async def _load_provider(self, conf: ProviderConfig) -> None:
950983
"""Load (or reload) a provider."""

music_assistant/providers/smart_fades/__init__.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from music_assistant_models.config_entries import ConfigEntry, ConfigValueType
99
from music_assistant_models.enums import ConfigEntryType
1010

11-
from .provider import SmartFadesProvider
11+
from music_assistant.helpers.util import verify_system_meets_requirements
1212

1313
if TYPE_CHECKING:
1414
from music_assistant_models.config_entries import ProviderConfig
@@ -17,15 +17,28 @@
1717

1818
from music_assistant.mass import MusicAssistant
1919

20+
from .provider import SmartFadesProvider
21+
2022
SUPPORTED_FEATURES: set[ProviderFeature] = set()
2123

24+
# Smart Fades runs on-device ML (torch) inference; gate it to capable hardware.
25+
MIN_RAM_GB = 6.0
26+
MIN_CPU_CORES = 4
27+
2228

2329
async def setup(
2430
mass: MusicAssistant,
2531
manifest: ProviderManifest,
2632
config: ProviderConfig,
2733
) -> SmartFadesProvider:
2834
"""Set up the Smart Fades provider."""
35+
# Gate before importing the provider module so the heavy torch/beat_this stack is
36+
# never imported on a host that does not meet the minimal requirements.
37+
verify_system_meets_requirements(
38+
feature_name="Smart Fades", min_memory_gb=MIN_RAM_GB, min_cpu_cores=MIN_CPU_CORES
39+
)
40+
from .provider import SmartFadesProvider # noqa: PLC0415
41+
2942
return SmartFadesProvider(mass, manifest, config, SUPPORTED_FEATURES)
3043

3144

music_assistant/providers/sonic_analysis/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
from music_assistant_models.config_entries import ConfigEntry, ConfigValueOption
1414
from music_assistant_models.enums import ConfigEntryType, ContentType
1515

16-
from music_assistant.helpers.util import verify_cpu_supports_ml_inference
16+
from music_assistant.helpers.util import (
17+
verify_cpu_supports_ml_inference,
18+
verify_system_meets_requirements,
19+
)
1720
from music_assistant.models.audio_analysis import AudioAnalysisData
1821
from music_assistant.models.audio_analysis_provider import (
1922
AnalysisSessionData,
@@ -66,6 +69,10 @@
6669

6770
CONF_CLAP_SAMPLING: str = "clap_sampling"
6871

72+
# Sonic Analysis runs on-device CLAP inference; gate it to capable hardware.
73+
MIN_RAM_GB: float = 8.0
74+
MIN_CPU_CORES: int = 4
75+
6976

7077
@dataclass
7178
class SonicSessionData(AnalysisSessionData):
@@ -320,6 +327,9 @@ async def handle_async_init(self) -> None:
320327
available=False, which the AudioAnalysisController already honors when
321328
scheduling work.
322329
"""
330+
verify_system_meets_requirements(
331+
feature_name="Sonic Analysis", min_memory_gb=MIN_RAM_GB, min_cpu_cores=MIN_CPU_CORES
332+
)
323333
verify_cpu_supports_ml_inference()
324334
(
325335
self._clap_model,

music_assistant/providers/sonic_similarity/provider.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def __init__(
113113
# CLAP text encoder — lazy: stays None until the first text_search call.
114114
self._text_encoder: Any = None
115115
self._text_encoder_lock = asyncio.Lock()
116+
self._text_encoder_warm_started = False
116117
# Per-label last error from fire-and-forget rebuild tasks.
117118
self._last_rebuild_error: dict[str, str] = {}
118119
self._last_seen_row_count: int = 0
@@ -218,12 +219,9 @@ async def loaded_in_mass(self) -> None:
218219
"sonic_similarity/text_search", self._handle_text_search
219220
)
220221
)
221-
# Warm in background so the timeout-less global SEARCH dispatcher never blocks
222-
# on the ~500MB GPT2 download; search() short-circuits until the encoder is set.
223-
self.mass.create_task(
224-
self._get_text_encoder, task_id="sonic_similarity_text_encoder_warm"
225-
)
226-
self.logger.info("Text search ready (encoder warming in background)")
222+
# The ~500MB GPT2 text encoder loads lazily on the first query (see search()
223+
# and _handle_text_search), not at plugin start.
224+
self.logger.info("Text search enabled (encoder loads on first query)")
227225

228226
self.mass.tasks.register_scheduled_task(
229227
task_id=PERIODIC_REFRESH_TASK_ID,
@@ -819,9 +817,14 @@ async def search(
819817
return SearchResults()
820818
if self._clap_index is None or len(self._clap_index) == 0:
821819
return SearchResults()
822-
# Only serve once the encoder is warm; never hold up the global search
823-
# gather waiting on the lazy load (background-scheduled in loaded_in_mass).
820+
# Never hold up the timeout-less global SEARCH gather on the ~500MB encoder load:
821+
# kick off a one-time background warm on the first query, and short-circuit until ready.
824822
if self._text_encoder is None:
823+
if not self._text_encoder_warm_started:
824+
self._text_encoder_warm_started = True
825+
self.mass.create_task(
826+
self._get_text_encoder, task_id="sonic_similarity_text_encoder_warm"
827+
)
825828
return SearchResults()
826829
emb_np = await self._embed_text_query(search_query)
827830
if emb_np is None:

0 commit comments

Comments
 (0)