Bound audio-analysis CPU usage and silence NNPACK spam on ARM (#4257)

marcelveldt · web-flow · commit b2bfac8671d5 · 2026-06-16T12:18:32.000+02:00
diff --git a/music_assistant/controllers/streams/audio_analysis.py b/music_assistant/controllers/streams/audio_analysis.py
@@ -28,6 +28,7 @@
 from music_assistant.helpers.api import api_command
 from music_assistant.helpers.datetime import local_clock_time_to_utc
 from music_assistant.helpers.json import json_dumps, json_loads
+from music_assistant.helpers.util import is_arm
 from music_assistant.models.audio_analysis import AudioAnalysisData
 from music_assistant.models.audio_analysis_provider import AudioAnalysisProvider
 from music_assistant.models.music_provider import MusicProvider
@@ -136,7 +137,10 @@ def __init__(self, streams: StreamsController) -> None:
         self.logger = self.mass.logger.getChild("audio_analysis")
         self._active_sessions: dict[str, set[str]] = {}
         self._workers: dict[str, asyncio.Task[None]] = {}
-        self._thread_caps_configured = False
+        self._inference_runtime_configured = False
+        # Kept alive to persist the process-wide native BLAS thread cap (set in
+        # ensure_inference_runtime_configured); never used as a context manager.
+        self._blas_limiter: object | None = None
 
     def setup(self) -> None:
         """Register the nightly background scan task."""
@@ -162,32 +166,49 @@ async def close(self) -> None:
         if workers:
             await asyncio.gather(*workers, return_exceptions=True)
 
-    def ensure_thread_caps_configured(self) -> None:
+    def ensure_inference_runtime_configured(self) -> None:
         """
-        Cap PyTorch threading for analysis inference (process-wide, applied once).
+        Configure the on-device inference runtime for analysis (process-wide, applied once).
 
         Torch-backed analysis providers call this at the start of their handle_async_init,
         before loading their models.
         """
-        # Lazy torch import: only torch-backed providers call this, so a host running no
-        # such provider never imports torch. Running before the first model load also lets
-        # set_num_interop_threads take effect (it can only be set before the first torch op).
-        if self._thread_caps_configured:
+        if self._inference_runtime_configured:
             return
+        # Lazy imports: only torch-backed providers call this, so a host running no such
+        # provider never imports torch/threadpoolctl. Running before the first model load
+        # also lets set_num_interop_threads take effect (only settable before the first op).
+        import threadpoolctl  # noqa: PLC0415
         import torch  # noqa: PLC0415
 
         budget = self._aa_thread_budget()
         torch.set_num_threads(budget)
         with contextlib.suppress(RuntimeError):
             # set_num_interop_threads can only be called before the first torch op
             torch.set_num_interop_threads(1)
+        # torch.set_num_threads only governs torch's own ops. The per-block librosa/numpy
+        # feature extraction runs through the native BLAS pool (OpenBLAS), which otherwise
+        # spawns a thread per core per worker and, across concurrent sessions, saturates
+        # every core and starves playback. Cap it to the same budget; the limiter is kept
+        # alive on the controller so the cap persists for the process.
+        self._blas_limiter = threadpoolctl.threadpool_limits(limits=budget, user_api="blas")
+        arm = is_arm()
+        if arm:
+            # NNPACK frequently fails to initialize on ARM SBCs (e.g. Raspberry Pi); torch
+            # then re-logs "Could not initialize NNPACK" to stderr on every conv op. The fp32
+            # conv fallback is used on those hosts regardless, so disabling NNPACK only removes
+            # the log spam.
+            with contextlib.suppress(Exception):
+                torch.backends.nnpack.set_flags(False)  # type: ignore[no-untyped-call]
         self.logger.info(
-            "AudioAnalysis thread caps: torch intra=%d, torch interop=%d",
+            "AudioAnalysis runtime: torch intra=%d interop=%d, blas<=%d, nnpack=%s",
             torch.get_num_threads(),
             torch.get_num_interop_threads(),
+            budget,
+            "off" if arm else "on",
         )
         # Only mark done once configuration actually succeeded, so a failure retries.
-        self._thread_caps_configured = True
+        self._inference_runtime_configured = True
 
     @property
     def providers(self) -> list[AudioAnalysisProvider]:
diff --git a/music_assistant/helpers/util.py b/music_assistant/helpers/util.py
@@ -130,6 +130,11 @@ def get_total_system_memory() -> float:
         return 0.0
 
 
+def is_arm() -> bool:
+    """Return whether the host is ARMv7 or newer (32- or 64-bit); ARMv6 and older do not match."""
+    return platform.machine().lower() in ("arm64", "aarch64", "armv8l", "armv7l")
+
+
 def verify_system_meets_requirements(
     *,
     feature_name: str,
diff --git a/music_assistant/providers/smart_fades/manifest.json b/music_assistant/providers/smart_fades/manifest.json
@@ -4,7 +4,7 @@
   "name": "Smart Fades",
   "description": "Smart fades analyzes beat and downbeat detection, energy and musical key for smart crossfades.",
   "codeowners": ["@music-assistant"],
-  "requirements": ["beat-this==1.1.0", "nnAudio==0.3.3"],
+  "requirements": ["beat-this==1.1.0", "nnAudio==0.3.3", "threadpoolctl==3.6.0"],
   "credits": ["[Beat This!](https://github.com/CPJKU/beat_this)", "[skey](https://github.com/deezer/skey)"],
   "documentation": "https://music-assistant.io/audio-analysis/smart-fades/",
   "multi_instance": false,
diff --git a/music_assistant/providers/smart_fades/provider.py b/music_assistant/providers/smart_fades/provider.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import asyncio
-import platform
 import time
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
@@ -16,6 +15,7 @@
 from torchaudio.transforms import SpectralCentroid
 
 from music_assistant.constants import VERBOSE_LOG_LEVEL
+from music_assistant.helpers.util import is_arm
 from music_assistant.models.audio_analysis import AudioAnalysisData
 from music_assistant.models.audio_analysis_provider import AudioAnalysisProvider
 
@@ -73,8 +73,8 @@ def __init__(
 
     async def handle_async_init(self) -> None:
         """Handle async initialization of the provider."""
-        # Configure torch thread caps before loading any model (see the controller method).
-        self.mass.streams.audio_analysis.ensure_thread_caps_configured()
+        # Configure the inference runtime before loading any model (see the controller method).
+        self.mass.streams.audio_analysis.ensure_inference_runtime_configured()
         (
             self._beat_this_model,
             self._beat_this_post_processor,
@@ -88,8 +88,7 @@ def _initialize_models(self) -> tuple[Any, ...]:
         """Initialize ML models (runs in a thread to avoid blocking the event loop)."""
         beat_this_model = Spect2Frames(checkpoint_path="small0", device=self._device)
         # torch aarch64 wheels advertise fbgemm in supported_engines but its kernels are x86-only.
-        is_arm = platform.machine().lower() in ("arm64", "aarch64", "armv8l", "armv7l")
-        preference = ("qnnpack", "fbgemm") if is_arm else ("fbgemm", "qnnpack")
+        preference = ("qnnpack", "fbgemm") if is_arm() else ("fbgemm", "qnnpack")
         supported_engines = torch.backends.quantized.supported_engines
         quantized_engine = next((e for e in preference if e in supported_engines), None)
         if quantized_engine is not None and torch.backends.quantized.engine != quantized_engine:
diff --git a/music_assistant/providers/sonic_analysis/__init__.py b/music_assistant/providers/sonic_analysis/__init__.py
@@ -338,8 +338,8 @@ async def handle_async_init(self) -> None:
             min_cpu_cores=MIN_CPU_CORES,
             require_ml_inference=True,
         )
-        # Configure torch thread caps before loading the model (see the controller method).
-        self.mass.streams.audio_analysis.ensure_thread_caps_configured()
+        # Configure the inference runtime before loading the model (see the controller method).
+        self.mass.streams.audio_analysis.ensure_inference_runtime_configured()
         (
             self._clap_model,
             self._clap_text_embeddings,
diff --git a/music_assistant/providers/sonic_analysis/manifest.json b/music_assistant/providers/sonic_analysis/manifest.json
@@ -8,7 +8,8 @@
     "transformers==5.6.2",
     "huggingface-hub==1.12.0",
     "PyYAML==6.0.3",
-    "torchlibrosa==0.1.0"
+    "torchlibrosa==0.1.0",
+    "threadpoolctl==3.6.0"
   ],
   "credits": [
     "[Microsoft CLAP](https://github.com/microsoft/CLAP)",
diff --git a/requirements_all.txt b/requirements_all.txt
@@ -86,6 +86,7 @@ soundcloudpy==0.1.4
 sounddevice==0.5.5
 srptools>=1.0.0
 sxm==0.2.8
+threadpoolctl==3.6.0
 torch==2.11.0+cpu; sys_platform == 'linux' and platform_machine == 'x86_64'
 torch==2.11.0; sys_platform != 'linux' or platform_machine != 'x86_64'
 torchaudio==2.11.0+cpu; sys_platform == 'linux' and platform_machine == 'x86_64'
diff --git a/tests/controllers/streams/test_audio_analysis.py b/tests/controllers/streams/test_audio_analysis.py
@@ -47,13 +47,19 @@ async def test_distribute_chunk_calls_all_providers() -> None:
     p2.process_pcm_chunk.assert_awaited_once_with(session_key, b"\x00" * 1024)
 
 
-def test_ensure_thread_caps_configured_is_idempotent() -> None:
-    """Torch thread caps are applied once per controller, however many providers init."""
+def test_ensure_inference_runtime_configured_is_idempotent() -> None:
+    """The inference runtime (torch + native BLAS caps) is configured once per controller."""
     controller = _make_controller()
-    with patch("torch.set_num_threads") as set_threads, patch("torch.set_num_interop_threads"):
-        controller.ensure_thread_caps_configured()
-        controller.ensure_thread_caps_configured()
+    with (
+        patch("torch.set_num_threads") as set_threads,
+        patch("torch.set_num_interop_threads"),
+        patch("threadpoolctl.threadpool_limits") as blas_limits,
+        patch("torch.backends.nnpack.set_flags"),
+    ):
+        controller.ensure_inference_runtime_configured()
+        controller.ensure_inference_runtime_configured()
     set_threads.assert_called_once()
+    blas_limits.assert_called_once()
 
 
 @pytest.mark.asyncio
diff --git a/tests/core/test_helpers.py b/tests/core/test_helpers.py
@@ -527,3 +527,19 @@ def test_system_meets_requirements(cpu_cores: int, total_gb: float, expected: bo
         patch("music_assistant.helpers.util.get_total_system_memory", return_value=total_gb),
     ):
         assert util.system_meets_requirements(min_memory_gb=6.0, min_cpu_cores=4) is expected
+
+
+@pytest.mark.parametrize(
+    ("machine", "expected"),
+    [
+        ("aarch64", True),
+        ("arm64", True),
+        ("armv7l", True),
+        ("x86_64", False),
+        ("AMD64", False),
+    ],
+)
+def test_is_arm(machine: str, expected: bool) -> None:
+    """is_arm recognizes 32/64-bit ARM and rejects x86."""
+    with patch("music_assistant.helpers.util.platform.machine", return_value=machine):
+        assert util.is_arm() is expected

Original file line number	Diff line number	Diff line change
`@@ -338,8 +338,8 @@ async def handle_async_init(self) -> None:`
`338`	`338`	`min_cpu_cores=MIN_CPU_CORES,`
`339`	`339`	`require_ml_inference=True,`
`340`	`340`	`)`
`341`		`- # Configure torch thread caps before loading the model (see the controller method).`
`342`		`- self.mass.streams.audio_analysis.ensure_thread_caps_configured()`
	`341`	`+ # Configure the inference runtime before loading the model (see the controller method).`
	`342`	`+ self.mass.streams.audio_analysis.ensure_inference_runtime_configured()`
`343`	`343`	`(`
`344`	`344`	`self._clap_model,`
`345`	`345`	`self._clap_text_embeddings,`