Commit d5bc8bc

fix(services): tighten error handling on registry + eviction + sighup paths
Review pass #2 caught a cluster of silent-failure concerns around the registry
hot paths and the eviction hooks that now wire it to gateway mutations,
downstream DELETE, classification, and SIGHUP. Fixes land as one commit because
they share a design: narrow the catches, raise the log level where a failure
leaves state wrong, and introduce a dedicated exception type so "registry not
initialised" stops hiding other RuntimeErrors.

mcpgateway/services/upstream_session_registry.py:
- New RegistryNotInitializedError(RuntimeError) so catch-sites can distinguish
  the "not started yet" case from other runtime errors (e.g. "Event loop is
  closed" during shutdown). Inherits RuntimeError for backwards compatibility
  with catch-sites written pre-split.
- _probe_health: narrow the catch-all "except Exception → recreate" to
  (OSError, TimeoutError, McpError). AttributeError from MCP SDK drift,
  authorization errors, and other genuinely-unexpected conditions now propagate
  instead of driving an infinite reconnect loop against the same failure.
- _default_session_factory.owner(): change except BaseException to except
  Exception so SystemExit / KeyboardInterrupt / CancelledError propagate
  promptly during shutdown. Add an add_done_callback that logs a warning if the
  owner task exits unexpectedly — previously a post-init upstream death
  silently left an orphaned session in self._sessions.
- is_closed: bump the MCP-internals introspection except from pass to
  logger.debug with the exception type, so SDK drift is visible.
- acquire(): wrap the yield in try/except (OSError, anyio.ClosedResourceError,
  anyio.BrokenResourceError). On transport-level errors from the caller body,
  evict so the next acquire rebuilds instead of handing back a dead session.
  Tool-level errors still leave the session in place.
- close_all(): asyncio.gather the per-key evictions (with
  return_exceptions=True). Previously serial with per-key 5s cap — 50 stuck
  sessions = 4+ minute shutdown stall.

mcpgateway/services/gateway_service.py _evict_upstream_sessions_for_gateway:
- Catch RegistryNotInitializedError specifically for the tests/early-startup
  no-op case. Bump the generic-exception branch from debug to warning with
  gateway_id + exception type — this fires POST-commit, so a silent eviction
  failure leaves persisted stale credentials/URL/TLS material pinned on
  in-flight upstream sessions.

mcpgateway/cache/session_registry.py remove_session:
- Same RegistryNotInitializedError / warning treatment. An orphaned upstream
  after DELETE /mcp is otherwise invisible to ops.

mcpgateway/services/server_classification_service.py _perform_classification:
- Bump the Redis-purge catch from debug to warning. The entire point of this
  method is to KEEP classification keys absent so should_poll_server falls
  through to "always poll". A sustained purge failure re-opens the very
  regression this method exists to prevent (previous commit's Codex review fix).

mcpgateway/handlers/signal_handlers.py sighup_reload:
- Add upstream-registry drain between the SSL cache clear and the
  affinity-mapping drain. Previously SIGHUP only refreshed SSL contexts and
  cleared the affinity map — registry-held upstream ClientSessions kept their
  stale TLS material on the socket.
- Catch RegistryNotInitializedError at debug for the uninitialised case;
  warning for other drain failures.

Tests:
- test_upstream_session_registry.py FakeClientSession now raises OSError (was
  RuntimeError) to match the narrowed _probe_health catch — the test's intent
  was "broken transport → recreate" and OSError is the accurate stand-in.
- test_main_sighup.py: rewritten for the new three-step drain. Asserts SSL
  cache clear + registry.close_all() + affinity drain all fire, with the new
  log-message strings. Added a test covering the RegistryNotInitializedError
  debug-path branch.

529 related tests pass across registry + lifecycle + classification +
tool_service + cache + sighup suites.
Signed-off-by: Jonathan Springer <jps@s390x.com>
1 parent 0521f32 commit d5bc8bc

7 files changed

Lines changed: 213 additions & 50 deletions

File tree

mcpgateway/cache/session_registry.py

Lines changed: 20 additions & 10 deletions
@@ -1242,18 +1242,28 @@ def _db_remove() -> None:
             logger.error(f"Database error removing session {session_id}: {e}")
 
         # #4205: close any upstream MCP sessions this downstream session owned.
-        # Wrapped because the registry may not be initialized in every context
-        # (unit tests instantiate SessionRegistry directly), and an eviction
-        # failure must not interfere with downstream session teardown.
-        try:
-            # First-Party
-            from mcpgateway.services.upstream_session_registry import get_upstream_session_registry  # pylint: disable=import-outside-toplevel
+        # Wrapped because (a) the registry may not be initialized in tests or
+        # very early bootstrap, and (b) an eviction failure must not interfere
+        # with downstream-session teardown. Eviction failure is logged at
+        # warning rather than debug because it leaves an orphaned upstream
+        # session whose presence is otherwise invisible to ops.
+        # First-Party
+        from mcpgateway.services.upstream_session_registry import (  # pylint: disable=import-outside-toplevel
+            RegistryNotInitializedError,
+            get_upstream_session_registry,
+        )
 
+        try:
             await get_upstream_session_registry().evict_session(session_id)
-        except RuntimeError:
-            pass  # Registry not initialized (tests, early shutdown) — nothing to do.
-        except Exception as exc:
-            logger.debug(f"Upstream session eviction for {session_id} failed: {exc}")
+        except RegistryNotInitializedError:
+            pass  # Nothing to evict — tests or very-early bootstrap.
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "Upstream session eviction for downstream session %s failed (%s: %s); " "an orphaned upstream session may persist until its owner task exits",
+                session_id,
+                type(exc).__name__,
+                exc,
+            )
 
         logger.info(f"Removed session: {session_id}")
 

mcpgateway/handlers/signal_handlers.py

Lines changed: 28 additions & 6 deletions
@@ -13,11 +13,16 @@
 
 
 async def sighup_reload() -> None:
-    """Clear SSL context cache and drain MCP session pool on SIGHUP for certificate rotation.
+    """Clear SSL context cache + drain upstream sessions on SIGHUP for certificate rotation.
 
-    Clears the SSL context cache to force recreation of SSL contexts
-    with potentially updated certificates, and drains the MCP session
-    pool so pooled connections reconnect with new TLS state.
+    Three things have to happen in order for new TLS material to take effect
+    on a worker without restart:
+    1. Clear the SSL context cache so the next build uses new certs.
+    2. Close every in-process upstream MCP session — they hold their TLS
+       context on the socket and would keep using the old certs forever.
+    3. Drain the session-affinity in-memory mapping so the next downstream
+       request re-registers (Redis state survives; only the local fast-
+       path cache is cleared).
    """
    try:
        # First-Party
@@ -28,14 +33,31 @@ async def sighup_reload() -> None:
    except Exception as exc:
        logger.error(f"SIGHUP handler failed to clear SSL context cache: {exc}")
 
+    # #4205: upstream MCP sessions live in the registry now — draining only
+    # the affinity mapping was leaving stale TLS contexts pinned to registry-
+    # held ClientSessions.
+    try:
+        # First-Party
+        from mcpgateway.services.upstream_session_registry import (  # pylint: disable=import-outside-toplevel
+            RegistryNotInitializedError,
+            get_upstream_session_registry,
+        )
+
+        await get_upstream_session_registry().close_all()
+        logger.info("SIGHUP: upstream session registry drained for TLS rotation")
+    except RegistryNotInitializedError:
+        logger.debug("SIGHUP: upstream session registry not initialised; skipping drain")
+    except Exception as exc:
+        logger.warning(f"SIGHUP: upstream session registry drain failed: {exc}")
+
    try:
        # First-Party
        from mcpgateway.services.session_affinity import drain_session_affinity  # pylint: disable=import-outside-toplevel
 
        await drain_session_affinity()
-        logger.info("SIGHUP: MCP session pool drained for TLS rotation")
+        logger.info("SIGHUP: session-affinity mapping drained")
    except Exception as exc:
-        logger.debug(f"SIGHUP: MCP session pool drain skipped: {exc}")
+        logger.debug(f"SIGHUP: session-affinity drain skipped: {exc}")
 
 
 def sighup_handler(_signum: int, _frame: Any) -> None:
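The reload body above isolates each rotation step in its own try/except, so a failure in one step is logged at the severity the commit assigns but never prevents the later steps from running. A hedged, self-contained sketch of that pattern (the step functions are stand-ins, not the real mcpgateway callables):

```python
import logging

logger = logging.getLogger("sighup-demo")
executed: list[str] = []


def clear_ssl_cache() -> None:
    executed.append("ssl-cache")


def drain_upstream_registry() -> None:
    executed.append("registry")
    raise OSError("simulated drain failure")  # step 2 fails...


def drain_affinity_mapping() -> None:
    executed.append("affinity")  # ...but step 3 must still run


def sighup_reload_sketch() -> None:
    # Per-step isolation: one failed step cannot skip the rest.
    try:
        clear_ssl_cache()
    except Exception as exc:
        logger.error("SSL cache clear failed: %s", exc)
    try:
        drain_upstream_registry()
    except Exception as exc:
        logger.warning("registry drain failed: %s", exc)
    try:
        drain_affinity_mapping()
    except Exception as exc:
        logger.debug("affinity drain skipped: %s", exc)


sighup_reload_sketch()
```

The design choice mirrors the diff: the severity grades by consequence (error for the cache, warning for the registry, debug for the best-effort affinity map), while control flow is identical in every branch.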

mcpgateway/services/gateway_service.py

Lines changed: 17 additions & 6 deletions
@@ -446,15 +446,26 @@ async def _evict_upstream_sessions_for_gateway(gateway_id: str) -> int:
         The number of upstream sessions evicted (0 if the registry is
         unavailable or nothing matched).
     """
-    try:
-        # First-Party
-        from mcpgateway.services.upstream_session_registry import get_upstream_session_registry  # pylint: disable=import-outside-toplevel
+    # First-Party
+    from mcpgateway.services.upstream_session_registry import (  # pylint: disable=import-outside-toplevel
+        RegistryNotInitializedError,
+        get_upstream_session_registry,
+    )
 
+    try:
         return await get_upstream_session_registry().evict_gateway(gateway_id)
-    except RuntimeError:
+    except RegistryNotInitializedError:
+        # Unit tests / very-early startup — nothing to evict by definition.
         return 0
-    except Exception as exc:  # noqa: BLE001 — log and swallow; see docstring
-        logger.debug(f"Upstream session eviction for gateway {gateway_id} failed: {exc}")
+    except Exception as exc:  # noqa: BLE001 — see docstring; logged at warning because this
+        # fires POST-commit: auth / URL / TLS change is already persisted, so a silent eviction
+        # failure leaves in-flight downstream sessions talking to the stale gateway state.
+        logger.warning(
+            "Upstream session eviction for gateway %s failed (%s: %s); stale sessions may " "persist until their downstream session ends",
+            gateway_id,
+            type(exc).__name__,
+            exc,
+        )
         return 0
 
 

mcpgateway/services/server_classification_service.py

Lines changed: 10 additions & 2 deletions
@@ -251,8 +251,16 @@ async def _perform_classification(self) -> None:
                 self.CLASSIFICATION_METADATA_KEY,
                 self.CLASSIFICATION_TIMESTAMP_KEY,
             )
-        except Exception as exc:  # noqa: BLE001 — best-effort purge
-            logger.debug(f"Classification key purge failed: {exc}")
+        except Exception as exc:  # noqa: BLE001
+            # Warn rather than debug: the whole point of this cycle is to KEEP the
+            # classification keys absent so should_poll_server falls through to
+            # "poll now". A sustained purge failure re-opens the exact regression
+            # this method exists to prevent (#4205 follow-up). See the docstring.
+            logger.warning(
+                "Classification key purge failed (%s: %s); stale hot/cold state " "may linger in Redis and bias should_poll_server toward the cold schedule",
+                type(exc).__name__,
+                exc,
+            )
 
     async def get_server_classification(self, url: str) -> Optional[str]:
         """Get classification for a server (hot/cold).
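The invariant the new warning protects is "key absent means always poll". An illustrative sketch of that invariant, with a plain dict standing in for Redis and hypothetical names (`should_poll`, `COLD_INTERVAL`) — not the real `should_poll_server` logic:

```python
store: dict[str, float] = {}  # stand-in for the Redis classification keys
COLD_INTERVAL = 300.0         # assumed cold-schedule poll interval, seconds


def should_poll(url: str, last_poll: float, now: float) -> bool:
    # Key absent → fall through to "always poll".
    if f"classification:{url}" not in store:
        return True
    # Key present → the cold schedule applies.
    return (now - last_poll) >= COLD_INTERVAL


# A lingering key (failed purge) suppresses a poll that is only 50s old:
store["classification:https://example.test/mcp"] = 0.0
suppressed = not should_poll("https://example.test/mcp", last_poll=100.0, now=150.0)

# A successful purge restores always-poll:
store.clear()
restored = should_poll("https://example.test/mcp", last_poll=100.0, now=150.0)
```

This is why a purge failure is worth a warning: nothing crashes, but the absence-based fallback silently stops firing.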

mcpgateway/services/upstream_session_registry.py

Lines changed: 84 additions & 13 deletions
@@ -170,8 +170,12 @@ def is_closed(self) -> bool:
             open_rx = getattr(state, "open_receive_channels", 1)
             if isinstance(open_rx, int) and open_rx == 0:
                 return True
-        except Exception:  # nosec B110 — degrade gracefully if MCP internals shift
-            pass
+        except Exception as exc:  # noqa: BLE001 — degrade gracefully if MCP internals shift
+            logger.debug(
+                "is_closed introspection on ClientSession internals raised %s: %s; " "next acquire will fall back to owner-task liveness only",
+                type(exc).__name__,
+                exc,
+            )
         return False
 
 
@@ -225,7 +229,7 @@ async def owner() -> None:
             if req.message_handler_factory is not None:
                 try:
                     message_handler = req.message_handler_factory(req.url, req.gateway_id)
-                except Exception as exc:
+                except Exception as exc:  # noqa: BLE001 — handler failure is not fatal
                     logger.warning(
                         "Failed to build message handler for %s: %s",
                         sanitize_url_for_logging(req.url),
@@ -238,12 +242,32 @@ async def owner() -> None:
                 # Block until the registry signals shutdown; do NOT rely on
                 # task cancellation from a request handler (see class docs).
                 await shutdown_event.wait()
-            except BaseException as exc:
+            except Exception as exc:  # noqa: BLE001 — see below
+                # Broad catch on purpose: the upstream-setup path runs many
+                # third-party coroutines (httpx, anyio, MCP SDK) whose exception
+                # classes we cannot enumerate. BaseException is deliberately NOT
+                # caught — SystemExit / KeyboardInterrupt / CancelledError must
+                # propagate so the task exits promptly during shutdown.
                 if not ready.done():
                     ready.set_exception(RuntimeError(f"Failed to create upstream MCP session for {req.url}: {exc}"))
 
         task = asyncio.create_task(owner(), name=f"upstream-session-{sanitize_url_for_logging(req.url)}")
 
+        def _log_owner_exit(done_task: asyncio.Task) -> None:
+            """Surface unexpected owner-task deaths so an orphaned upstream session is visible to ops."""
+            if done_task.cancelled():
+                return
+            exc = done_task.exception()
+            if exc is not None:
+                logger.warning(
+                    "Upstream MCP owner task for %s exited with %s: %s — upstream session may be orphaned",
+                    sanitize_url_for_logging(req.url),
+                    type(exc).__name__,
+                    exc,
+                )
+
+        task.add_done_callback(_log_owner_exit)
+
         success = False
         try:
             session, transport_ctx_ref = await asyncio.wait_for(ready, timeout=req.timeout_seconds)
@@ -255,7 +279,7 @@ async def owner() -> None:
             with anyio.move_on_after(_DEFAULT_SHUTDOWN_TIMEOUT_SECONDS):
                 try:
                     await task
-                except BaseException:  # nosec B110
+                except (asyncio.CancelledError, Exception):  # noqa: BLE001 — cleanup swallow
                     pass
 
             # Smuggle the task + shutdown event onto the transport_ctx so the registry
@@ -373,7 +397,24 @@ async def acquire(
         session.last_used = time.time()
         session.use_count += 1
 
-        yield session
+        # Hand out the session with no lock held: MCP ClientSession multiplexes
+        # concurrent requests over its transport via JSON-RPC ids, so there's no
+        # reason to serialize callers. If the caller's body raises a transport-
+        # level error (server closed the stream, socket broke), evict so the
+        # next acquire rebuilds instead of handing out a dead session.
+        try:
+            yield session
+        except (OSError, anyio.ClosedResourceError, anyio.BrokenResourceError) as exc:
+            logger.info(
+                "acquire() caller raised %s for gateway=%s; evicting upstream so next acquire rebuilds",
+                type(exc).__name__,
+                gateway_id,
+            )
+            await self._evict_key(key)
+            raise
+        # All other exceptions (tool-level errors from the upstream, caller
+        # application errors) intentionally leave the session in place — the
+        # transport is fine, the caller just didn't like the result.
 
     async def evict_session(self, downstream_session_id: str) -> int:
         """Close and remove every upstream session owned by this downstream session id.
@@ -403,11 +444,19 @@ async def evict_gateway(self, gateway_id: str) -> int:
         return count
 
     async def close_all(self) -> None:
-        """Drain every upstream session. Intended for app shutdown."""
+        """Drain every upstream session concurrently. Intended for app shutdown.
+
+        Each ``_evict_key`` can take up to ``shutdown_timeout_seconds`` waiting
+        for the owner task to exit; running them in series on a worker with
+        dozens of downstream sessions would turn shutdown into a multi-minute
+        stall. ``asyncio.gather`` caps the total drain at roughly
+        ``shutdown_timeout_seconds`` plus a small constant.
+        """
         async with self._global_lock:
             keys = list(self._sessions.keys())
-            for key in keys:
-                await self._evict_key(key)
+            if not keys:
+                return
+            await asyncio.gather(*[self._evict_key(k) for k in keys], return_exceptions=True)
 
     def snapshot(self) -> RegistrySnapshot:
         """Return a point-in-time copy of the registry's counters."""
@@ -495,7 +544,17 @@ async def _create_session(
         )
 
     async def _probe_health(self, upstream: UpstreamSession) -> bool:
-        """Run the health check chain against an idle session. Returns False if all probes fail."""
+        """Run the health check chain against an idle session. Returns False if all probes fail.
+
+        Exception policy: we ADVANCE on ``TimeoutError`` and on
+        ``McpError(METHOD_NOT_FOUND)`` (the server chose not to implement
+        this probe), and we FAIL FAST on everything else transport- or
+        protocol-level (``OSError`` / anyio stream errors / other ``McpError``s)
+        — recreating a session on "permission denied" or "request too large"
+        would loop against the same failure. Genuinely unexpected exceptions
+        (``AttributeError`` from SDK drift, etc.) propagate so they surface in
+        telemetry instead of silently triggering a reconnect loop.
+        """
         for method in _HEALTH_CHECK_CHAIN:
             try:
                 if method == "skip":
@@ -517,7 +576,8 @@ async def _probe_health(self, upstream: UpstreamSession) -> bool:
                     return False
             except TimeoutError:
                 continue
-            except Exception:
+            except OSError:
+                # Socket / stream error — upstream is dead.
                 self._metrics.health_check_failures += 1
                 return False
         self._metrics.health_check_failures += 1
@@ -562,6 +622,17 @@ class _AcquireDecision(Enum):
 _registry: Optional[UpstreamSessionRegistry] = None
 
 
+class RegistryNotInitializedError(RuntimeError):
+    """Raised when ``get_upstream_session_registry()`` is called before startup init.
+
+    Callers that need to distinguish "registry not available yet" from other
+    runtime errors (so they can silently no-op in tests / early bootstrap
+    without also swallowing unrelated ``RuntimeError``s like "Event loop is
+    closed") should catch this type specifically. Inherits ``RuntimeError``
+    for backwards compatibility with catch-sites written before the split.
+    """
+
+
 def init_upstream_session_registry(
     *,
     message_handler_factory: Optional[MessageHandlerFactory] = None,
@@ -574,9 +645,9 @@ def init_upstream_session_registry(
 
 
 def get_upstream_session_registry() -> UpstreamSessionRegistry:
-    """Return the process-wide registry or raise if it has not been initialized."""
+    """Return the process-wide registry or raise ``RegistryNotInitializedError``."""
     if _registry is None:
-        raise RuntimeError("UpstreamSessionRegistry has not been initialized; call init_upstream_session_registry() first")
+        raise RegistryNotInitializedError("UpstreamSessionRegistry has not been initialized; call init_upstream_session_registry() first")
     return _registry
 
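The close_all() change is easy to demonstrate in isolation. A self-contained sketch (the eviction coroutine is a stand-in for `_evict_key`, with a sleep in place of the per-key owner-task wait): five evictions, one of which fails, take roughly one eviction's worth of wall time instead of five, and the failure is collected rather than aborting the rest.

```python
import asyncio
import time


async def evict(key: str, drained: set[str]) -> None:
    await asyncio.sleep(0.1)  # stand-in for the per-key owner-task wait
    if key == "bad":
        raise OSError("stuck transport")
    drained.add(key)


async def close_all_concurrent(keys: list[str]) -> tuple[float, set[str]]:
    drained: set[str] = set()
    start = time.monotonic()
    # return_exceptions=True: the "bad" eviction's OSError is returned in the
    # result list instead of raised, so the other four evictions complete.
    await asyncio.gather(*[evict(k, drained) for k in keys], return_exceptions=True)
    return time.monotonic() - start, drained


elapsed, drained = asyncio.run(close_all_concurrent(["a", "b", "c", "d", "bad"]))
```

Serially these five awaits would cost ~0.5s; gathered they cost ~0.1s, which is the commit's "50 stuck sessions = 4+ minute stall" arithmetic in miniature.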

tests/unit/mcpgateway/services/test_upstream_session_registry.py

Lines changed: 5 additions & 2 deletions
@@ -56,12 +56,15 @@ async def send_ping(self) -> None:
         if self.probe_exception is not None:
             raise self.probe_exception
         if not self.healthy:
-            raise RuntimeError("ping failed")
+            # Use a transport-level error — production _probe_health narrows its
+            # catch to (OSError, ...) so unexpected exception classes propagate
+            # as signals of SDK drift rather than silent reconnect loops.
+            raise OSError("ping failed")
 
     async def list_tools(self) -> None:
         self.list_tools_calls += 1
         if not self.healthy:
-            raise RuntimeError("list_tools failed")
+            raise OSError("list_tools failed")
 
 
 def _make_fake_factory():
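The contract this test change encodes can be shown with a toy probe. `probe_once` below is a hypothetical stand-in, not the real `_probe_health`: it catches only transport-level errors, so `OSError` means "unhealthy, recreate", while an `AttributeError` (the SDK-drift stand-in from the commit message) escapes to the caller.

```python
def probe_once(ping) -> bool:
    """Return False (unhealthy → recreate) on transport errors only."""
    try:
        ping()
        return True
    except OSError:
        # Broken socket/stream: the session is dead, recreating can help.
        return False
    # Anything else (e.g. AttributeError from SDK drift) propagates —
    # recreating the session would just loop against the same failure.


def dead_transport():
    raise OSError("ping failed")


def sdk_drift():
    raise AttributeError("ClientSession has no attribute '_read_stream'")
```

Under the old broad `except Exception`, `sdk_drift` would also have returned False and driven a reconnect loop; under the narrowed catch it surfaces immediately.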
