From 06cdb3a444f41a5e2aa8f001e9366b59fa2e7ae0 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 16 Apr 2026 18:31:43 +0200 Subject: [PATCH 01/47] Replace notification polling with Server-Sent Events (SSE) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add real-time notification delivery via SSE to replace the 30-second polling interval. The SSE endpoint streams notification_update, broadcast_update, and notification_status events to connected clients. Backend: - New SSEConnectionManager (lib/galaxy/managers/sse.py) maps user IDs to asyncio queues with thread-safe push via call_soon_threadsafe - SSE streaming endpoint at GET /api/notifications/stream with Last-Event-ID catch-up support and 30s keepalive - Kombu control tasks (notify_users, notify_broadcast) fan out events across all Galaxy worker processes - Existing polling API unchanged for backward compatibility Frontend: - New useNotificationSSE composable using EventSource with auto-reconnect - notificationsStore tries SSE first, falls back to polling after 5+ consecutive errors Tests: - API integration tests for SSE event delivery, broadcasts, and reconnect - Selenium E2E tests for notification appearance and bell indicator Add SSE-based real-time history update notifications Replace aggressive 3-second history polling with Server-Sent Events driven by database change detection, configurable via admin setting. 
Backend: - Add pg_notify() to PostgreSQL audit triggers for instant LISTEN/NOTIFY - New HistoryAuditMonitor: PG LISTEN/NOTIFY with SQLite polling fallback - New /api/events/stream SSE endpoint (uses StructuredApp, not MinimalManagerApp) - Kombu control task "history_update" with message TTL (expiration=10s) - Config: enable_sse_history_updates, history_audit_monitor_poll_interval Frontend: - Generalize useNotificationSSE → useSSE composable with event type filtering - historyStore connects SSE for history_update events, triggers immediate refresh - notificationsStore updated to use /api/events/stream and useSSE - Polling kept as fallback at existing intervals Tests: - 5 integration tests: endpoint, dataset upload event, history ID in payload, cross-user isolation, polling backward compatibility --- client/src/api/schema/schema.ts | 133 +++++++++ client/src/composables/useNotificationSSE.ts | 84 ++++++ client/src/stores/historyStore.ts | 31 ++- client/src/stores/notificationsStore.ts | 109 +++++++- doc/source/admin/galaxy_options.rst | 28 ++ lib/galaxy/app/__init__.py | 101 +++++-- lib/galaxy/config/sample/galaxy.yml.sample | 14 + lib/galaxy/config/schemas/config_schema.yml | 20 ++ lib/galaxy/managers/history_audit_monitor.py | 263 ++++++++++++++++++ lib/galaxy/managers/notification.py | 62 ++++- lib/galaxy/managers/sse.py | 227 +++++++++++++++ .../model/triggers/update_audit_table.py | 7 + lib/galaxy/queue_worker/__init__.py | 111 +++++++- lib/galaxy/structured_app/__init__.py | 6 +- lib/galaxy/webapps/galaxy/api/events.py | 66 +++++ .../webapps/galaxy/api/notifications.py | 37 +++ .../webapps/galaxy/services/notifications.py | 23 ++ lib/galaxy_test/base/sse.py | 136 +++++++++ test/integration/test_history_sse.py | 131 +++++++++ test/integration/test_notification_sse.py | 186 +++++++++++++ .../test_notification_sse.py | 110 ++++++++ 21 files changed, 1837 insertions(+), 48 deletions(-) create mode 100644 client/src/composables/useNotificationSSE.ts create mode 
100644 lib/galaxy/managers/history_audit_monitor.py create mode 100644 lib/galaxy/managers/sse.py create mode 100644 lib/galaxy/webapps/galaxy/api/events.py create mode 100644 lib/galaxy_test/base/sse.py create mode 100644 test/integration/test_history_sse.py create mode 100644 test/integration/test_notification_sse.py create mode 100644 test/integration_selenium/test_notification_sse.py diff --git a/client/src/api/schema/schema.ts b/client/src/api/schema/schema.ts index b171e21e0f37..62a3f2f0a51e 100644 --- a/client/src/api/schema/schema.ts +++ b/client/src/api/schema/schema.ts @@ -1292,6 +1292,33 @@ export interface paths { patch?: never; trace?: never; }; + "/api/events/stream": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Server-Sent Events stream for real-time updates. + * @description Opens a Server-Sent Events (SSE) connection that pushes real-time + * updates for notifications, history changes, and other events. + * + * On reconnect, the browser sends the ``Last-Event-ID`` header automatically. + * If the notification system is enabled, any notifications created since that + * timestamp are delivered as a catch-up ``notification_status`` event. + * + * Anonymous users receive only broadcast events. + */ + get: operations["stream_events_api_events_stream_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/exports": { parameters: { query?: never; @@ -3968,6 +3995,32 @@ export interface paths { patch?: never; trace?: never; }; + "/api/notifications/stream": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** + * Server-Sent Events stream for real-time notification updates. + * @description Opens a Server-Sent Events (SSE) connection that pushes notification updates in real-time. + * + * On reconnect, the browser sends the ``Last-Event-ID`` header automatically. 
+ * Any notifications created since that timestamp are delivered as a catch-up + * ``notification_status`` event before the stream begins. + * + * Anonymous users receive only broadcast events. + */ + get: operations["stream_notifications_api_notifications_stream_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/notifications/{notification_id}": { parameters: { query?: never; @@ -33209,6 +33262,46 @@ export interface operations { }; }; }; + stream_events_api_events_stream_get: { + parameters: { + query?: never; + header?: { + "Last-Event-ID"?: string | null; + /** @description The user ID that will be used to effectively make this API call. Only admins and designated users can make API calls on behalf of other users. */ + "run-as"?: string | null; + }; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Request Error */ + "4XX": { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["MessageExceptionModel"]; + }; + }; + /** @description Server Error */ + "5XX": { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["MessageExceptionModel"]; + }; + }; + }; + }; index_api_exports_get: { parameters: { query?: { @@ -42407,6 +42500,46 @@ export interface operations { }; }; }; + stream_notifications_api_notifications_stream_get: { + parameters: { + query?: never; + header?: { + "Last-Event-ID"?: string | null; + /** @description The user ID that will be used to effectively make this API call. Only admins and designated users can make API calls on behalf of other users. 
*/ + "run-as"?: string | null; + }; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content?: never; + }; + /** @description Request Error */ + "4XX": { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["MessageExceptionModel"]; + }; + }; + /** @description Server Error */ + "5XX": { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["MessageExceptionModel"]; + }; + }; + }; + }; show_notification_api_notifications__notification_id__get: { parameters: { query?: never; diff --git a/client/src/composables/useNotificationSSE.ts b/client/src/composables/useNotificationSSE.ts new file mode 100644 index 000000000000..84b03373cfa7 --- /dev/null +++ b/client/src/composables/useNotificationSSE.ts @@ -0,0 +1,84 @@ +import { onScopeDispose, ref } from "vue"; + +import { withPrefix } from "@/utils/redirect"; + +/** + * All SSE event types the server may emit. + */ +export const SSE_EVENT_TYPES = [ + "notification_update", + "broadcast_update", + "notification_status", + "history_update", +] as const; + +export type SSEEventType = (typeof SSE_EVENT_TYPES)[number]; + +/** + * Composable for connecting to the unified SSE event stream. + * + * The browser's EventSource handles reconnection automatically and + * sends the Last-Event-ID header so the server can catch up on missed events. 
+ * + * @param onEvent - callback invoked for every SSE event + * @param eventTypes - subset of event types to listen to (defaults to all) + */ +export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: readonly SSEEventType[] = SSE_EVENT_TYPES) { + const connected = ref(false); + let eventSource: EventSource | null = null; + let consecutiveErrors = 0; + + function connect() { + disconnect(); + consecutiveErrors = 0; + const url = withPrefix("/api/events/stream"); + eventSource = new EventSource(url); + + for (const eventType of eventTypes) { + eventSource.addEventListener(eventType, onEvent); + } + + eventSource.onopen = () => { + connected.value = true; + consecutiveErrors = 0; + // Expose a global readiness flag so Selenium tests can distinguish + // a working SSE pipeline from the polling fallback. + (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = true; + }; + + eventSource.onerror = () => { + connected.value = false; + (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = false; + consecutiveErrors++; + // EventSource auto-reconnects, but if we get too many errors + // in a row, the server likely doesn't support SSE — give up + // and let the caller fall back to polling. + if (consecutiveErrors > 5) { + disconnect(); + } + }; + } + + function disconnect() { + if (eventSource) { + for (const eventType of eventTypes) { + eventSource.removeEventListener(eventType, onEvent); + } + eventSource.close(); + eventSource = null; + } + connected.value = false; + (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = false; + } + + onScopeDispose(() => { + disconnect(); + }); + + return { connect, disconnect, connected }; +} + +/** + * @deprecated Use `useSSE` instead. This alias exists for backward compatibility. 
+ */ +export const useNotificationSSE = useSSE; diff --git a/client/src/stores/historyStore.ts b/client/src/stores/historyStore.ts index 019c57de8ae4..1d6c40acebc8 100644 --- a/client/src/stores/historyStore.ts +++ b/client/src/stores/historyStore.ts @@ -15,6 +15,7 @@ import type { ArchivedHistoryDetailed } from "@/api/histories.archived"; import { getGalaxyInstance } from "@/app"; import { HistoryFilters } from "@/components/History/HistoryFilters"; import { useResourceWatcher } from "@/composables/resourceWatcher"; +import { useSSE } from "@/composables/useNotificationSSE"; import { useUserLocalStorage } from "@/composables/userLocalStorage"; import { createAndSelectNewHistory, @@ -31,6 +32,7 @@ import { ACTIVE_POLLING_INTERVAL, INACTIVE_POLLING_INTERVAL, watchHistory as watchHistorySuppliedApp, + watchHistoryOnce as watchHistoryOnceSuppliedApp, } from "@/watch/watchHistory"; const PAGINATION_LIMIT = 10; @@ -391,6 +393,26 @@ export const useHistoryStore = defineStore("historyStore", () => { return watchHistorySuppliedApp(app); } + // SSE-driven history updates: when we receive a history_update event, + // immediately trigger a refresh of the current history + const SSE_HISTORY_EVENT_TYPES = ["history_update"] as const; + const { connect: sseHistoryConnect } = useSSE(handleHistorySSEEvent, SSE_HISTORY_EVENT_TYPES); + + function handleHistorySSEEvent(event: MessageEvent) { + try { + const data = JSON.parse(event.data); + const changedHistoryIds: string[] = data.history_ids ?? 
[]; + // If the current history was updated, trigger a refresh + if (currentHistoryId.value && changedHistoryIds.includes(currentHistoryId.value)) { + const app = getGalaxyInstance(); + watchHistoryOnceSuppliedApp(app); + } + } catch (e) { + console.error("Error handling history SSE event:", e); + } + } + + // Polling fallback — keeps running as a safety net even when SSE is connected const { startWatchingResource: startWatchingHistory, stopWatchingResource: stopWatchingHistory, @@ -400,6 +422,13 @@ export const useHistoryStore = defineStore("historyStore", () => { longPollingInterval: INACTIVE_POLLING_INTERVAL, }); + function startWatchingHistoryWithSSE() { + // Always start polling as a baseline + startWatchingHistory(); + // Also connect SSE for instant updates + sseHistoryConnect(); + } + async function loadHistoryById(historyId: string) { if (!isLoadingHistory.has(historyId)) { isLoadingHistory.add(historyId); @@ -525,7 +554,7 @@ export const useHistoryStore = defineStore("historyStore", () => { restoreHistory, restoreHistories, handleTotalCountChange, - startWatchingHistory, + startWatchingHistory: startWatchingHistoryWithSSE, stopWatchingHistory, isWatchingHistory, loadCurrentHistory, diff --git a/client/src/stores/notificationsStore.ts b/client/src/stores/notificationsStore.ts index 2a154885d999..220bd8e796ac 100644 --- a/client/src/stores/notificationsStore.ts +++ b/client/src/stores/notificationsStore.ts @@ -1,9 +1,10 @@ import { defineStore } from "pinia"; -import { computed, ref } from "vue"; +import { computed, ref, watch } from "vue"; import { GalaxyApi } from "@/api"; import type { NotificationChanges, UserNotification, UserNotificationsBatchUpdateRequest } from "@/api/notifications"; import { useResourceWatcher } from "@/composables/resourceWatcher"; +import { useSSE } from "@/composables/useNotificationSSE"; import { rethrowSimple } from "@/utils/simple-error"; import { mergeObjectListsById } from "@/utils/utils"; @@ -13,11 +14,6 @@ const 
ACTIVE_POLLING_INTERVAL = 30000; // 30 seconds const INACTIVE_POLLING_INTERVAL = ACTIVE_POLLING_INTERVAL * 20; // 10 minutes export const useNotificationsStore = defineStore("notificationsStore", () => { - const { startWatchingResource: startWatchingNotifications, stopWatchingResource: stopWatchingNotifications } = - useResourceWatcher(getNotificationStatus, { - shortPollingInterval: ACTIVE_POLLING_INTERVAL, - longPollingInterval: INACTIVE_POLLING_INTERVAL, - }); const broadcastsStore = useBroadcastsStore(); const totalUnreadCount = ref(0); @@ -25,9 +21,82 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { const loadingNotifications = ref(false); const lastNotificationUpdate = ref(null); + const wantSSE = ref(true); const unreadNotifications = computed(() => notifications.value.filter((n) => !n.seen_time)); + // --- SSE setup (listen only for notification event types) --- + const NOTIFICATION_EVENT_TYPES = ["notification_update", "broadcast_update", "notification_status"] as const; + const { + connect: sseConnect, + disconnect: sseDisconnect, + connected: sseConnected, + } = useSSE(handleSSEEvent, NOTIFICATION_EVENT_TYPES); + + // --- Polling fallback --- + const { startWatchingResource: startPolling, stopWatchingResource: stopPolling } = useResourceWatcher( + getNotificationStatus, + { + shortPollingInterval: ACTIVE_POLLING_INTERVAL, + longPollingInterval: INACTIVE_POLLING_INTERVAL, + }, + ); + + function stopWatchingNotifications() { + sseDisconnect(); + stopPolling(); + } + + // When SSE connection drops and doesn't recover, fall back to polling + watch(sseConnected, (isConnected) => { + if (!isConnected && wantSSE.value) { + // SSE disconnected but we still want updates — don't start polling + // immediately, EventSource will auto-reconnect. Only if useSSE is + // set to false (after too many errors) do we fall back. 
+ } + }); + + watch(wantSSE, (wantSSE) => { + if (!wantSSE) { + sseDisconnect(); + startPolling(); + } + }); + + function handleSSEEvent(event: MessageEvent) { + try { + const data = JSON.parse(event.data); + switch (event.type) { + case "notification_update": + notifications.value = mergeObjectListsById( + notifications.value, + [data as UserNotification], + "create_time", + "desc", + ); + updateUnreadCount(); + break; + case "broadcast_update": + broadcastsStore.updateBroadcasts([data]); + break; + case "notification_status": + // Full catch-up on reconnect (same shape as GET /api/notifications/status) + totalUnreadCount.value = data.total_unread_count; + notifications.value = mergeObjectListsById( + notifications.value, + data.notifications as UserNotification[], + "create_time", + "desc", + ); + broadcastsStore.updateBroadcasts(data.broadcasts); + break; + } + lastNotificationUpdate.value = new Date(); + } catch (e) { + console.error("Error handling SSE event:", e); + } + } + async function loadNotifications() { const { data, error } = await GalaxyApi().GET("/api/notifications"); @@ -76,6 +145,29 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { } } + async function startWatchingNotifications() { + // Always do an initial load first + if (!lastNotificationUpdate.value) { + try { + loadingNotifications.value = true; + await broadcastsStore.loadBroadcasts(); + await loadNotifications(); + updateUnreadCount(); + lastNotificationUpdate.value = new Date(); + } catch (e) { + console.error(e); + } finally { + loadingNotifications.value = false; + } + } + + if (wantSSE.value) { + sseConnect(); + } else { + startPolling(); + } + } + async function updateBatchNotification(request: UserNotificationsBatchUpdateRequest) { const { error } = await GalaxyApi().PUT("/api/notifications", { body: request, @@ -88,7 +180,10 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { if (request.changes.deleted) { 
notifications.value = notifications.value.filter((n) => !request.notification_ids.includes(n.id)); } - startWatchingNotifications(); + // If not using SSE, trigger a poll to refresh state + if (!sseConnected.value) { + startWatchingNotifications(); + } } async function updateNotification(notification: UserNotification, changes: NotificationChanges) { diff --git a/doc/source/admin/galaxy_options.rst b/doc/source/admin/galaxy_options.rst index 47e3aaacf343..da52bf02d820 100644 --- a/doc/source/admin/galaxy_options.rst +++ b/doc/source/admin/galaxy_options.rst @@ -5804,6 +5804,34 @@ :Type: str +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``enable_sse_history_updates`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + Enables real-time history update notifications via Server-Sent + Events (SSE). When enabled, a background monitor watches for + history changes (via PostgreSQL LISTEN/NOTIFY or audit table + polling as a fallback for SQLite) and pushes update signals to + connected browsers, replacing aggressive 3-second polling. +:Default: ``false`` +:Type: bool + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``history_audit_monitor_poll_interval`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + The interval in seconds between history audit table polls when + using the polling fallback (SQLite or when PostgreSQL + LISTEN/NOTIFY is unavailable). Only used when + enable_sse_history_updates is true. Lower values mean faster + updates but more database queries. Recommended range: 1-5 seconds. 
+:Default: ``2`` +:Type: int + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``enable_notification_system`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/lib/galaxy/app/__init__.py b/lib/galaxy/app/__init__.py index e8b8ef1b9410..672efde7dc6e 100644 --- a/lib/galaxy/app/__init__.py +++ b/lib/galaxy/app/__init__.py @@ -77,6 +77,10 @@ from galaxy.managers.object_store_instances import UserObjectStoreResolverImpl from galaxy.managers.roles import RoleManager from galaxy.managers.session import GalaxySessionManager +from galaxy.managers.sse import ( + SSEConnectionManager, + SSEEventDispatcher, +) from galaxy.managers.tasks import ( AsyncTasksManager, CeleryAsyncTasksManager, @@ -150,8 +154,12 @@ ) from galaxy.tool_shed.cache import ToolShedRepositoryCache from galaxy.tool_shed.galaxy_install.client import InstallationTarget -from galaxy.tool_shed.galaxy_install.installed_repository_manager import InstalledRepositoryManager -from galaxy.tool_shed.galaxy_install.update_repository_manager import UpdateRepositoryManager +from galaxy.tool_shed.galaxy_install.installed_repository_manager import ( + InstalledRepositoryManager, +) +from galaxy.tool_shed.galaxy_install.update_repository_manager import ( + UpdateRepositoryManager, +) from galaxy.tool_util.data import ToolDataTableManager as BaseToolDataTableManager from galaxy.tool_util.deps import containers from galaxy.tool_util.deps.dependencies import AppInfo @@ -250,7 +258,11 @@ def before_send(event, hint): # "cannot find 'file_name' while searching for 'species_chromosomes.file_name'"] # If we don't do this issues are never properly grouped since by default the calling stack is inspected, # and that is always unique in cheetah as it is dynamically generated. 
- event["fingerprint"] = [str(exc_value), str(exc_value.tool_version), str(exc_value.__cause__)] + event["fingerprint"] = [ + str(exc_value), + str(exc_value.tool_version), + str(exc_value.__cause__), + ] event.setdefault("tags", {}).update( { "tool_is_latest": exc_value.is_latest, @@ -307,7 +319,10 @@ def __init__(self, fsmon=False, **kwargs) -> None: config_file = kwargs.get("global_conf", {}).get("__file__", None) if config_file: log.debug('Using "galaxy.ini" config file: %s', config_file) - self._configure_models(check_migrate_databases=self.config.check_migrate_databases, config_file=config_file) + self._configure_models( + check_migrate_databases=self.config.check_migrate_databases, + config_file=config_file, + ) # Security helper self._configure_security() self._register_singleton(IdEncodingHelper, self.security) @@ -411,7 +426,11 @@ def _configure_toolbox(self): index_help = getattr(self.config, "index_tool_help", True) self.toolbox_search = self._register_singleton( ToolBoxSearch, - ToolBoxSearch(self.toolbox, index_dir=self.config.tool_search_index_dir, index_help=index_help), + ToolBoxSearch( + self.toolbox, + index_dir=self.config.tool_search_index_dir, + index_help=index_help, + ), ) @property @@ -496,8 +515,9 @@ def _configure_object_store(self, **kwds): templates = ConfiguredObjectStoreTemplates.from_app_config(self.config, vault_configured=vault_configured) self.object_store_templates = self._register_singleton(ConfiguredObjectStoreTemplates, templates) user_object_store_resolver = self._register_abstract_singleton( - UserObjectStoreResolver, UserObjectStoreResolverImpl # type: ignore[type-abstract] - ) # Ignored because of https://github.com/python/mypy/issues/4717 + UserObjectStoreResolver, # type: ignore[type-abstract] # https://github.com/python/mypy/issues/4717 + UserObjectStoreResolverImpl, + ) kwds["user_object_store_resolver"] = user_object_store_resolver self.object_store = build_object_store_from_config(self.config, **kwds) @@ -600,7 
+620,13 @@ class GalaxyManagerApplication(MinimalManagerApp, MinimalGalaxyApplication): model: GalaxyModelMapping - def __init__(self, configure_logging=True, use_converters=True, use_display_applications=True, **kwargs): + def __init__( + self, + configure_logging=True, + use_converters=True, + use_display_applications=True, + **kwargs, + ): super().__init__(**kwargs) self._register_singleton(MinimalManagerApp, self) # type: ignore[type-abstract] self.execution_timer_factory = self._register_singleton( @@ -617,7 +643,8 @@ def __init__(self, configure_logging=True, use_converters=True, use_display_appl # Initialize job metrics manager, needs to be in place before # config so per-destination modifications can be made. self.job_metrics = self._register_singleton( - JobMetrics, JobMetrics(self.config.job_metrics_config_file, self.config.job_metrics, app=self) + JobMetrics, + JobMetrics(self.config.job_metrics_config_file, self.config.job_metrics, app=self), ) # Initialize the job management configuration self.job_config = self._register_singleton(jobs.JobConfiguration) @@ -655,11 +682,15 @@ def __init__(self, configure_logging=True, use_converters=True, use_display_appl self.role_manager = self._register_singleton(RoleManager) self.job_manager = self._register_singleton(JobManager) + # SSE dispatcher must be registered before NotificationManager so Lagom + # can auto-inject the Optional[SSEEventDispatcher] constructor arg. 
+ self._register_singleton(SSEEventDispatcher, SSEEventDispatcher(self)) self.notification_manager = self._register_singleton(NotificationManager) self.interactivetool_manager = InteractiveToolManager(self) self.task_manager = self._register_abstract_singleton( - AsyncTasksManager, CeleryAsyncTasksManager # type: ignore[type-abstract] # https://github.com/python/mypy/issues/4717 + AsyncTasksManager, # type: ignore[type-abstract] # https://github.com/python/mypy/issues/4717 + CeleryAsyncTasksManager, ) # ConfiguredFileSources @@ -671,10 +702,12 @@ def __init__(self, configure_logging=True, use_converters=True, use_display_appl self._register_singleton(FileSourcePluginLoader, file_source_plugin_loader) self.file_source_templates = self._register_singleton(ConfiguredFileSourceTemplates, templates) self._register_singleton( - UserDefinedFileSourcesConfig, UserDefinedFileSourcesConfig.from_app_config(self.config) + UserDefinedFileSourcesConfig, + UserDefinedFileSourcesConfig.from_app_config(self.config), ) user_defined_file_sources = self._register_abstract_singleton( - UserDefinedFileSources, UserDefinedFileSourcesImpl # type: ignore[type-abstract] # https://github.com/python/mypy/issues/4717 + UserDefinedFileSources, # type: ignore[type-abstract] # https://github.com/python/mypy/issues/4717 + UserDefinedFileSourcesImpl, ) configured_file_source_conf: ConfiguredFileSourcesConf = ConfiguredFileSourcesConf.from_app_config(self.config) file_sources = ConfiguredFileSources( @@ -690,7 +723,8 @@ def __init__(self, configure_logging=True, use_converters=True, use_display_appl # Load security policy. 
self.security_agent = self.model.security_agent self.host_security_agent = galaxy.model.security.HostAgent( - self.security_agent.sa_session, permitted_actions=self.security_agent.permitted_actions + self.security_agent.sa_session, + permitted_actions=self.security_agent.permitted_actions, ) # We need the datatype registry for running certain tasks that modify HDAs, and to build the registry we need @@ -791,6 +825,7 @@ def __init__(self, **kwargs) -> None: ("queue worker", self._shutdown_queue_worker), ("file watcher", self._shutdown_watcher), ("database heartbeat", self._shutdown_database_heartbeat), + ("history audit monitor", self._shutdown_history_audit_monitor), ("workflow scheduler", self._shutdown_scheduling_manager), ("object store", self._shutdown_object_store), ("job manager", self._shutdown_job_manager), @@ -809,17 +844,23 @@ def __init__(self, **kwargs) -> None: # queue_worker *can* be initialized with a queue, but here we don't # want to and we'll allow postfork to bind and start it. 
self.queue_worker = self._register_singleton(GalaxyQueueWorker, GalaxyQueueWorker(self)) + # SSE connection manager for real-time notification push + self.sse_connection_manager = self._register_singleton(SSEConnectionManager) # AI agent registry and service agent_registry = build_agent_registry(self.config) self._register_singleton(AgentRegistry, agent_registry) - self._register_singleton(AgentService, AgentService(self.config, JobQueryManager(self), agent_registry)) + self._register_singleton( + AgentService, + AgentService(self.config, JobQueryManager(self), agent_registry), + ) self.dependency_resolvers_view = self._register_singleton( DependencyResolversView, DependencyResolversView(self) ) self.test_data_resolver = self._register_singleton( - TestDataResolver, TestDataResolver(file_dirs=self.config.tool_test_data_directories) + TestDataResolver, + TestDataResolver(file_dirs=self.config.tool_test_data_directories), ) self.api_keys_manager = self._register_singleton(ApiKeyManager) @@ -876,7 +917,9 @@ def __init__(self, **kwargs) -> None: # Start the heartbeat process if configured and available if self.config.use_heartbeat: self.heartbeat = heartbeat.Heartbeat( - self.config, period=self.config.heartbeat_interval, fname=self.config.heartbeat_log + self.config, + period=self.config.heartbeat_interval, + fname=self.config.heartbeat_log, ) self.heartbeat.daemon = True self.application_stack.register_postfork_function(self.heartbeat.start) @@ -886,7 +929,9 @@ def __init__(self, **kwargs) -> None: from galaxy.authnz import managers self.authnz_manager = managers.AuthnzManager( - self, self.config.oidc_config_file, self.config.oidc_backends_config_file + self, + self.config.oidc_config_file, + self.config.oidc_backends_config_file, ) # If there is only a single external authentication provider in use @@ -923,7 +968,12 @@ def __init__(self, **kwargs) -> None: self.workflow_completion_hook_registry, ) 
self.application_stack.register_postfork_function(self.workflow_completion_monitor.start) - self.haltables.append(("WorkflowCompletionMonitor", self.workflow_completion_monitor.shutdown_monitor)) + self.haltables.append( + ( + "WorkflowCompletionMonitor", + self.workflow_completion_monitor.shutdown_monitor, + ) + ) # Start the job manager self.application_stack.register_postfork_function(self.job_manager.start) @@ -942,12 +992,20 @@ def __init__(self, **kwargs) -> None: self.database_heartbeat.add_change_callback(self.watchers.change_state) self.application_stack.register_postfork_function(self.database_heartbeat.start) + # History audit monitor for SSE-based history updates + if self.config.enable_sse_history_updates: + from galaxy.managers.history_audit_monitor import HistoryAuditMonitor + + self._history_audit_monitor = self._register_singleton(HistoryAuditMonitor) + self.application_stack.register_postfork_function(self._history_audit_monitor.start) + # Start web stack message handling self.application_stack.register_postfork_function(self.application_stack.start) self.application_stack.register_postfork_function(self.queue_worker.bind_and_start) # Reload toolbox to pick up changes to toolbox made after master was ready self.application_stack.register_postfork_function( - lambda: reload_toolbox(self, save_integrated_tool_panel=False), post_fork_only=True + lambda: reload_toolbox(self, save_integrated_tool_panel=False), + post_fork_only=True, ) # Delay toolbox index until after startup self.application_stack.register_postfork_function( @@ -975,6 +1033,11 @@ def _shutdown_watcher(self): def _shutdown_database_heartbeat(self): self.database_heartbeat.shutdown() + def _shutdown_history_audit_monitor(self): + monitor = getattr(self, "_history_audit_monitor", None) + if monitor: + monitor.shutdown() + def _shutdown_scheduling_manager(self): self.workflow_scheduling_manager.shutdown() diff --git a/lib/galaxy/config/sample/galaxy.yml.sample 
b/lib/galaxy/config/sample/galaxy.yml.sample index 524ad8dcf774..34ae86e4b1bd 100644 --- a/lib/galaxy/config/sample/galaxy.yml.sample +++ b/lib/galaxy/config/sample/galaxy.yml.sample @@ -3129,6 +3129,20 @@ galaxy: # Message to display on the export citations tool page #citations_export_message_html: When writing up your analysis, remember to include all references that should be cited in order to completely describe your work. Also, please remember to cite Galaxy. + # Enables real-time history update notifications via Server-Sent + # Events (SSE). When enabled, a background monitor watches for history + # changes (via PostgreSQL LISTEN/NOTIFY or audit table polling as a + # fallback for SQLite) and pushes update signals to connected + # browsers, replacing aggressive 3-second polling. + #enable_sse_history_updates: false + + # The interval in seconds between history audit table polls when using + # the polling fallback (SQLite or when PostgreSQL LISTEN/NOTIFY is + # unavailable). Only used when enable_sse_history_updates is true. + # Lower values mean faster updates but more database queries. + # Recommended range: 1-5 seconds. + #history_audit_monitor_poll_interval: 2 + # Enables the Notification System integrated in Galaxy. # Users can receive automatic notifications when a certain resource is # shared with them or when some long running operations have finished, diff --git a/lib/galaxy/config/schemas/config_schema.yml b/lib/galaxy/config/schemas/config_schema.yml index 05c50b6eb390..aba88b87cee4 100644 --- a/lib/galaxy/config/schemas/config_schema.yml +++ b/lib/galaxy/config/schemas/config_schema.yml @@ -4293,6 +4293,26 @@ mapping: desc: | Message to display on the export citations tool page + enable_sse_history_updates: + type: bool + default: false + required: false + desc: | + Enables real-time history update notifications via Server-Sent Events (SSE). 
"""Monitor for history audit table changes.

Detects history changes via PostgreSQL LISTEN/NOTIFY (instant) or by polling
the history_audit table (SQLite fallback). Dispatches SSE events to connected
users via Kombu control tasks.

Only active when ``enable_sse_history_updates`` is True in the Galaxy config.
"""

import logging
import select
import threading
import time
from collections import (
    defaultdict,
    OrderedDict,
)
from datetime import (
    datetime,
    timedelta,
)
from typing import (
    Any,
    Iterator,
    Optional,
)

from sqlalchemy import select as sa_select
from sqlalchemy.engine import Engine

from galaxy.config import GalaxyAppConfiguration
from galaxy.managers.sse import SSEEventDispatcher
from galaxy.model import (
    History,
    HistoryAudit,
)
from galaxy.model.mapping import GalaxyModelMapping

log = logging.getLogger(__name__)

CHANNEL_NAME = "galaxy_history_update"
OWNER_CACHE_MAX = 10_000
DEBOUNCE_SECONDS = 0.2
# Overlap window for the polling fallback.  Audit rows are stamped with the
# *database* clock (clock_timestamp()), while the poll window uses the app
# clock; rows can also be committed while the previous query is executing.
# Re-scanning this far into the already-covered interval prevents silent
# misses in both cases.  Duplicate dispatches are harmless — the event only
# triggers a client-side refresh.
POLL_OVERLAP_SECONDS = 1.0


class _PgListenAdapter:
    """Thin DBAPI-level adapter for PostgreSQL LISTEN/NOTIFY.

    Hides the receiving-API differences between psycopg2 (``conn.poll()`` +
    ``conn.notifies`` list, driven by ``select.select``) and psycopg3
    (``conn.notifies(timeout=...)`` generator). The SA URL is used to inherit
    DSN / SSL / auth config, but the connection itself is opened directly with
    the DBAPI driver so it stays outside the SA pool — LISTEN connections must
    live for the lifetime of the monitor and never be returned to the pool.
    """

    def __init__(self, engine: Engine) -> None:
        # Strip the SA ``+driver`` suffix so the raw DBAPI libraries accept the URL.
        dsn = engine.url.set(drivername="postgresql").render_as_string(hide_password=False)
        driver = engine.dialect.driver
        if driver == "psycopg":
            import psycopg  # conditional: psycopg3 driver

            self._conn: Any = psycopg.connect(dsn, autocommit=True)
            self.driver = "psycopg3"
        else:
            import psycopg2  # conditional: psycopg2 driver

            self._conn = psycopg2.connect(dsn)
            self._conn.autocommit = True  # same effect as set_isolation_level(AUTOCOMMIT)
            self.driver = "psycopg2"

    def listen(self, channel: str) -> None:
        """Issue ``LISTEN <channel>`` on the dedicated connection."""
        with self._conn.cursor() as cursor:
            cursor.execute(f"LISTEN {channel};")

    def close(self) -> None:
        """Close the LISTEN connection, swallowing (but logging) errors."""
        try:
            self._conn.close()
        except Exception:
            log.debug("Error closing LISTEN connection", exc_info=True)

    def poll(self, timeout: float) -> Iterator[str]:
        """Block up to ``timeout`` seconds and yield notification payloads.

        Returns an empty iterator on timeout so callers can uniformly treat a
        timeout as "nothing received in this tick" regardless of driver.
        """
        if self.driver == "psycopg3":
            # psycopg3: notifies() is a blocking generator bounded by ``timeout``.
            yield from (n.payload for n in self._conn.notifies(timeout=timeout))
            return
        # psycopg2: block on the socket via select(), then drain notifies list.
        if select.select([self._conn], [], [], timeout) == ([], [], []):
            return
        self._conn.poll()
        while self._conn.notifies:
            yield self._conn.notifies.pop(0).payload


class HistoryAuditMonitor:
    """Background thread that monitors history_audit for changes and dispatches SSE events.

    On PostgreSQL: uses LISTEN/NOTIFY for instant notification.
    On SQLite: polls history_audit table at a configurable interval.
    """

    def __init__(
        self,
        config: GalaxyAppConfiguration,
        model: GalaxyModelMapping,
        sse_dispatcher: SSEEventDispatcher,
    ) -> None:
        self._config = config
        self._model = model
        self._dispatcher = sse_dispatcher
        self.poll_interval: int = config.history_audit_monitor_poll_interval
        self._is_postgres: bool = "postgres" in model.engine.name
        self._exit = threading.Event()
        self._thread: Optional[threading.Thread] = None
        self._active = False
        # Bounded LRU cache: history_id -> user_id, refreshed on miss.
        self._history_owner_cache: "OrderedDict[int, int]" = OrderedDict()

    def start(self) -> None:
        """Start the monitor thread (idempotent)."""
        if self._active:
            return
        self._active = True
        target = self._listen_postgres if self._is_postgres else self._poll_audit_table
        self._thread = threading.Thread(
            target=target,
            name="history_audit_monitor",
            daemon=True,
        )
        self._thread.start()
        log.info(
            "HistoryAuditMonitor started (mode=%s, interval=%ds)",
            "pg_listen" if self._is_postgres else "poll",
            self.poll_interval,
        )

    def shutdown(self) -> None:
        """Signal the monitor thread to exit and wait briefly for it."""
        self._active = False
        self._exit.set()
        if self._thread:
            self._thread.join(timeout=5)

    # --- PostgreSQL LISTEN/NOTIFY mode ---

    def _listen_postgres(self) -> None:
        """LISTEN for history update notifications.

        Works against both psycopg2 and psycopg3 — whichever driver the SA
        engine was built with. Falls back to the SQLite polling path if the
        DBAPI driver can't be imported, the initial LISTEN fails, or the
        LISTEN loop dies unexpectedly.
        """
        try:
            adapter = _PgListenAdapter(self._model.engine)
            adapter.listen(CHANNEL_NAME)
        except Exception:
            log.warning(
                "Failed to establish PostgreSQL LISTEN connection, falling back to polling",
                exc_info=True,
            )
            self._poll_audit_table()
            return

        log.debug("LISTEN %s established (driver=%s)", CHANNEL_NAME, adapter.driver)
        pending: dict[int, float] = {}  # history_id -> first_seen_time

        try:
            while not self._exit.is_set():
                received_any = False
                for payload in adapter.poll(self.poll_interval):
                    received_any = True
                    try:
                        history_id = int(payload)
                    except (ValueError, TypeError):
                        continue
                    pending.setdefault(history_id, time.monotonic())

                if not received_any:
                    # Timeout — flush anything that's been pending since last tick
                    if pending:
                        self._dispatch_history_updates(set(pending.keys()))
                        pending.clear()
                    continue

                # Debounce: dispatch events that have been pending long enough
                now = time.monotonic()
                ready = {hid for hid, ts in pending.items() if now - ts >= DEBOUNCE_SECONDS}
                if ready:
                    self._dispatch_history_updates(ready)
                    for hid in ready:
                        del pending[hid]
        except Exception:
            log.exception("HistoryAuditMonitor LISTEN loop error")
        finally:
            adapter.close()

        # Robustness: an unexpected loop error previously left the monitor
        # dead (no events at all) until process restart.  If we were not asked
        # to exit, keep updates flowing via the polling fallback instead.
        if not self._exit.is_set():
            log.warning("LISTEN loop terminated unexpectedly, falling back to polling")
            self._poll_audit_table()

    # --- SQLite polling fallback ---

    def _poll_audit_table(self) -> None:
        """Poll history_audit for recent changes.

        The scan window overlaps the previous one by ``POLL_OVERLAP_SECONDS``
        so rows committed while the previous query was executing (or skewed
        slightly by the app-clock vs DB-clock difference) are not missed.
        """
        last_check = datetime.utcnow() - timedelta(seconds=self.poll_interval)
        overlap = timedelta(seconds=POLL_OVERLAP_SECONDS)

        while not self._exit.is_set():
            try:
                check_time = datetime.utcnow()
                stmt = (
                    sa_select(HistoryAudit.history_id)
                    .where(HistoryAudit.update_time > last_check - overlap)
                    .group_by(HistoryAudit.history_id)
                )
                with self._model.new_session() as session:
                    changed_ids = set(session.scalars(stmt).all())

                if changed_ids:
                    self._dispatch_history_updates(changed_ids)

                last_check = check_time
            except Exception:
                log.exception("HistoryAuditMonitor poll error")

            self._exit.wait(self.poll_interval)

    # --- Common dispatch logic ---

    def _dispatch_history_updates(self, history_ids: set[int]) -> None:
        """Map history_ids to user_ids and send Kombu control task.

        Raw integer history IDs are sent across the control queue; encoding is
        deferred to the ``history_update`` task handler on the receiving side,
        keeping this manager free of presentation concerns.
        """
        # Resolve owners for unknown history_ids
        unknown = history_ids - self._history_owner_cache.keys()
        if unknown:
            self._refresh_owner_cache(unknown)

        user_updates: dict[str, list[int]] = defaultdict(list)
        for history_id in history_ids:
            user_id = self._history_owner_cache.get(history_id)
            if user_id is not None:
                user_updates[str(user_id)].append(history_id)

        if not user_updates:
            return

        self._dispatcher.history_update(user_updates=dict(user_updates))

    def _refresh_owner_cache(self, history_ids: set[int]) -> None:
        """Look up user_id for given history_ids and update the bounded cache."""
        try:
            stmt = sa_select(History.id, History.user_id).where(History.id.in_(history_ids))
            with self._model.new_session() as session:
                for row in session.execute(stmt):
                    self._history_owner_cache[row[0]] = row[1]
                    self._history_owner_cache.move_to_end(row[0])
            while len(self._history_owner_cache) > OWNER_CACHE_MAX:
                self._history_owner_cache.popitem(last=False)
        except Exception:
            log.debug("Failed to refresh history owner cache", exc_info=True)
MandatoryNotificationCategory, MessageNotificationContent, NewSharedItemNotificationContent, @@ -58,6 +60,7 @@ NotificationCreateData, NotificationCreateRequest, NotificationRecipients, + NotificationResponse, NotificationVariant, PersonalNotificationCategory, UpdateUserNotificationPreferencesRequest, @@ -94,9 +97,15 @@ def send(self, notification: Notification, user: User): class NotificationManager: """Manager class to interact with the database models related with Notifications.""" - def __init__(self, sa_session: galaxy_scoped_session, config: GalaxyAppConfiguration): + def __init__( + self, + sa_session: galaxy_scoped_session, + config: GalaxyAppConfiguration, + sse_dispatcher: Optional[SSEEventDispatcher] = None, + ): self.sa_session = sa_session self.config = config + self.sse_dispatcher = sse_dispatcher self.recipient_resolver = NotificationRecipientResolver(strategy=DefaultStrategy(sa_session)) self.user_notification_columns: list[InstrumentedAttribute] = [ Notification.id, @@ -164,6 +173,10 @@ def send_notification_to_recipients(self, request: NotificationCreateRequest) -> notifications_sent = self._create_associations(notification, recipient_users) self.sa_session.commit() + # Push SSE events to connected users via control queue + user_ids = [user.id for user in recipient_users] + self._notify_users_via_sse(user_ids, notification) + return notification, notifications_sent def _create_associations(self, notification: Notification, users: list[User]) -> int: @@ -179,6 +192,26 @@ def _create_associations(self, notification: Notification, users: list[User]) -> continue return success_count + def _notify_users_via_sse(self, user_ids: list[int], notification: Notification) -> None: + """Broadcast a control task to all workers to push SSE events to connected users.""" + if not self.sse_dispatcher or not user_ids: + return + try: + payload = NotificationResponse.model_validate(notification).model_dump_json() + self.sse_dispatcher.notify_users(user_ids, payload) 
+ except Exception: + log.warning("Failed to send SSE notification event", exc_info=True) + + def _notify_broadcast_via_sse(self, notification: Notification) -> None: + """Broadcast a control task to all workers to push SSE broadcast events.""" + if not self.sse_dispatcher: + return + try: + payload = BroadcastNotificationResponse.model_validate(notification).model_dump_json() + self.sse_dispatcher.notify_broadcast(payload) + except Exception: + log.warning("Failed to send SSE broadcast event", exc_info=True) + def dispatch_pending_notifications_via_channels(self) -> int: """ Dispatches all pending notifications to the users depending on the configured channels. @@ -273,6 +306,7 @@ def create_broadcast_notification(self, request: BroadcastNotificationCreateRequ notification = self._create_notification_model(request) self.sa_session.add(notification) self.sa_session.commit() + self._notify_broadcast_via_sse(notification) return notification def get_user_notification(self, user: User, notification_id: int, active_only: Optional[bool] = True): @@ -353,7 +387,10 @@ def get_all_broadcasted_notifications(self, since: Optional[datetime] = None, ac return result def update_user_notifications( - self, user: User, notification_ids: set[int], request: UserNotificationUpdateRequest + self, + user: User, + notification_ids: set[int], + request: UserNotificationUpdateRequest, ) -> int: """Updates a batch of notifications associated with the user using the requested values.""" updated_row_count = 0 @@ -447,7 +484,8 @@ def cleanup_expired_notifications(self) -> CleanupResultSummary: UserNotificationAssociation.notification_id.in_(expired_notifications_stmt) ) result = cast( - CursorResult, self.sa_session.execute(delete_stmt, execution_options={"synchronize_session": False}) + CursorResult, + self.sa_session.execute(delete_stmt, execution_options={"synchronize_session": False}), ) deleted_associations_count = result.rowcount @@ -474,7 +512,10 @@ def _create_notification_model( 
return notification def _user_notifications_query( - self, user: User, since: Optional[datetime] = None, active_only: Optional[bool] = True + self, + user: User, + since: Optional[datetime] = None, + active_only: Optional[bool] = True, ): stmt = ( select(*self.user_notification_columns) @@ -552,7 +593,7 @@ def resolve_users(self, recipients: NotificationRecipients) -> list[User]: user_ids_from_roles_stmt = self._get_all_user_ids_from_roles_query(all_role_ids) union_stmt = union(user_ids_from_groups_stmt, user_ids_from_roles_stmt) - user_ids_from_groups_and_roles = {id for id, in self.sa_session.execute(union_stmt)} + user_ids_from_groups_and_roles = {id for (id,) in self.sa_session.execute(union_stmt)} unique_user_ids.update(user_ids_from_groups_and_roles) stmt = select(User).where(User.id.in_(unique_user_ids)) @@ -591,7 +632,7 @@ def _expand_group_and_roles_ids(self, group_ids: set[int], role_ids: set[int]) - .where(GroupRoleAssociation.role_id.in_(role_ids)) .distinct() ) - group_ids_from_roles = {id for id, in self.sa_session.execute(stmt) if id is not None} + group_ids_from_roles = {id for (id,) in self.sa_session.execute(stmt) if id is not None} new_group_ids = group_ids_from_roles - processed_group_ids # Get role IDs associated with any of the given group IDs @@ -601,7 +642,7 @@ def _expand_group_and_roles_ids(self, group_ids: set[int], role_ids: set[int]) - .where(GroupRoleAssociation.group_id.in_(group_ids)) .distinct() ) - role_ids_from_groups = {id for id, in self.sa_session.execute(stmt) if id is not None} + role_ids_from_groups = {id for (id,) in self.sa_session.execute(stmt) if id is not None} new_role_ids = role_ids_from_groups - processed_role_ids # Stop if there are no new group or role IDs to process @@ -713,7 +754,6 @@ def get_body(self, template_format: TemplateFormats) -> str: class MessageEmailNotificationTemplateBuilder(EmailNotificationTemplateBuilder): - markdown_to = { TemplateFormats.HTML: to_html, TemplateFormats.TXT: lambda x: x, # TODO: 
"""Server-Sent Events (SSE) connection manager for real-time notifications.

Manages per-worker in-memory mapping of user IDs to asyncio.Queue instances,
enabling push of events from any thread (e.g. Kombu control queue worker)
to async SSE endpoint handlers running in the uvicorn event loop.
"""

import asyncio
import logging
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime
from typing import (
    AsyncIterator,
    Optional,
    TYPE_CHECKING,
)

if TYPE_CHECKING:
    from starlette.requests import Request

    from galaxy.structured_app import MinimalManagerApp

log = logging.getLogger(__name__)


@dataclass
class SSEEvent:
    """An event to be sent to an SSE client."""

    event: str  # e.g. "notification_update", "broadcast_update", "notification_status"
    data: str  # JSON payload
    id: Optional[str] = None  # ISO timestamp, used by EventSource as Last-Event-ID on reconnect

    def to_wire(self) -> str:
        """Serialize this event to the SSE wire format.

        Per the SSE specification, a payload containing newlines must be split
        into one ``data:`` field per line — a raw newline inside a single
        ``data:`` field would terminate the event early and corrupt the
        stream.  The ``id`` field must not contain newlines at all, so any are
        replaced defensively.  The client's EventSource rejoins the ``data:``
        lines with ``\\n``, so round-tripping is lossless.
        """
        parts = [f"event: {self.event}"]
        for data_line in self.data.split("\n"):
            parts.append(f"data: {data_line}")
        if self.id:
            safe_id = self.id.replace("\r", " ").replace("\n", " ")
            parts.append(f"id: {safe_id}")
        return "\n".join(parts) + "\n\n"


class SSEConnectionManager:
    """Per-worker manager for SSE connections.

    Maps user_ids to sets of asyncio.Queue instances. Each SSE connection
    gets its own queue. The manager is thread-safe for push operations
    via ``loop.call_soon_threadsafe``.

    Lifecycle:
    - Instantiated once per Galaxy worker process (on app object).
    - ``connect()`` is called from the SSE async endpoint (event loop thread).
    - ``disconnect()`` is called from the SSE endpoint's ``finally`` block.
    - ``push_to_user()`` / ``push_broadcast()`` are called from ANY thread
      (typically the Kombu daemon thread via control task handlers).
    """

    def __init__(self) -> None:
        self._connections: dict[int, set[asyncio.Queue]] = defaultdict(set)
        self._broadcast_connections: set[asyncio.Queue] = set()
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    def _ensure_loop(self) -> None:
        """Capture the running asyncio event loop. Must be called from async context."""
        if self._loop is None or self._loop.is_closed():
            self._loop = asyncio.get_running_loop()

    # -- Called from ASYNC context (uvicorn event loop thread) --

    def connect(self, user_id: Optional[int]) -> asyncio.Queue:
        """Register a new SSE connection. Returns a queue to await events from.

        Called from the SSE endpoint handler (async context). A ``ready`` event is
        enqueued immediately so that clients (and tests) can synchronize on the
        server-side subscription rather than the underlying socket open event.
        """
        self._ensure_loop()
        queue: asyncio.Queue = asyncio.Queue(maxsize=64)
        if user_id is not None:
            self._connections[user_id].add(queue)
        self._broadcast_connections.add(queue)
        queue.put_nowait(SSEEvent(event="ready", data=""))
        log.debug(
            "SSE connection opened for user_id=%s (total=%d)",
            user_id,
            len(self._broadcast_connections),
        )
        return queue

    def disconnect(self, user_id: Optional[int], queue: asyncio.Queue) -> None:
        """Unregister an SSE connection.

        Called from the SSE endpoint's ``finally`` block (async context).
        """
        if user_id is not None:
            self._connections[user_id].discard(queue)
            if not self._connections[user_id]:
                del self._connections[user_id]
        self._broadcast_connections.discard(queue)
        log.debug(
            "SSE connection closed for user_id=%s (total=%d)",
            user_id,
            len(self._broadcast_connections),
        )

    # -- Called from ANY thread (Kombu thread or async) --

    def push_to_user(self, user_id: int, event: SSEEvent) -> None:
        """Thread-safe. Push an event to all SSE connections for a specific user."""
        for queue in list(self._connections.get(user_id, [])):
            self._safe_put(queue, event)

    def push_broadcast(self, event: SSEEvent) -> None:
        """Thread-safe. Push an event to ALL connected SSE clients."""
        for queue in list(self._broadcast_connections):
            self._safe_put(queue, event)

    def _safe_put(self, queue: asyncio.Queue, event: SSEEvent) -> None:
        """Cross the thread boundary safely using ``call_soon_threadsafe``."""
        if self._loop is None or self._loop.is_closed():
            return
        try:
            self._loop.call_soon_threadsafe(self._do_put, queue, event)
        except RuntimeError:
            # Event loop is closed or shutting down
            pass

    @staticmethod
    def _do_put(queue: asyncio.Queue, event: SSEEvent) -> None:
        """Runs ON the event loop thread. Safe to touch asyncio.Queue here."""
        try:
            queue.put_nowait(event)
        except asyncio.QueueFull:
            log.warning("SSE queue full, dropping event: %s", event.event)

    @property
    def connected_user_ids(self) -> set[int]:
        """User IDs with at least one open SSE connection on this worker."""
        return set(self._connections.keys())

    @property
    def total_connections(self) -> int:
        """Total open SSE connections on this worker (including anonymous)."""
        return len(self._broadcast_connections)

    # -- High-level streaming helper --

    async def stream(
        self,
        request: "Request",
        user_id: Optional[int],
        catch_up: Optional[SSEEvent] = None,
        keepalive: float = 30.0,
    ) -> AsyncIterator[str]:
        """Yield SSE-framed strings for one connected client.

        Handles ``connect``, optional catch-up event priming, the main event
        loop with a keepalive comment on timeout, disconnect detection, and
        ``disconnect`` in ``finally``. Controllers should call this and return
        the iterator wrapped in a ``StreamingResponse``.
        """
        queue = self.connect(user_id)
        if catch_up is not None:
            await queue.put(catch_up)
        try:
            while True:
                if await request.is_disconnected():
                    break
                try:
                    event: SSEEvent = await asyncio.wait_for(queue.get(), timeout=keepalive)
                    yield event.to_wire()
                except asyncio.TimeoutError:
                    yield ": keepalive\n\n"
        finally:
            self.disconnect(user_id, queue)


class SSEEventDispatcher:
    """Fans out SSE events across all Galaxy worker processes via the control queue.

    This is a thin wrapper around ``send_control_task`` so that managers can
    depend on a narrow, injectable collaborator instead of importing the
    queue-worker module directly.

    In Celery / background-task processes the app object has no ``queue_worker``
    (it's only built in ``UniverseApplication``), so dispatch is silently a
    no-op there — notifications created from Celery tasks will still be
    delivered the next time a client polls, just not pushed in real time.
    """

    def __init__(self, app: "MinimalManagerApp") -> None:
        self._app = app

    def _send(self, task: str, kwargs: dict) -> None:
        """Publish a control task, or no-op when no control queue exists."""
        if getattr(self._app, "queue_worker", None) is None:
            # No control-queue publisher available (e.g. Celery worker context).
            log.debug("SSE dispatch skipped: app has no queue_worker (task=%s)", task)
            return
        from galaxy.queue_worker import send_control_task  # circular: queue_worker -> app -> managers

        send_control_task(self._app, task, kwargs=kwargs, expiration=10)

    def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str] = None) -> None:
        """Fan out a ``notification_update`` event to the given users on all workers."""
        self._send(
            "notify_users",
            {
                "user_ids": user_ids,
                "payload": payload,
                "event_id": event_id or datetime.utcnow().isoformat(),
            },
        )

    def notify_broadcast(self, payload: str, event_id: Optional[str] = None) -> None:
        """Fan out a ``broadcast_update`` event to all connected clients on all workers."""
        self._send(
            "notify_broadcast",
            {
                "payload": payload,
                "event_id": event_id or datetime.utcnow().isoformat(),
            },
        )

    def history_update(self, user_updates: dict[str, list], event_id: Optional[str] = None) -> None:
        """Fan out a ``history_update`` event (user_id -> history_ids) to all workers."""
        self._send(
            "history_update",
            {
                "user_updates": user_updates,
                "event_id": event_id or datetime.utcnow().isoformat(),
            },
        )
@@ -77,6 +83,7 @@ def row_trigger_fn(id_field): INSERT INTO history_audit (history_id, update_time) VALUES (NEW.{id_field}, clock_timestamp() AT TIME ZONE 'UTC') ON CONFLICT DO NOTHING; + PERFORM pg_notify('galaxy_history_update', NEW.{id_field}::text); RETURN NULL; END; $BODY$ diff --git a/lib/galaxy/queue_worker/__init__.py b/lib/galaxy/queue_worker/__init__.py index 0e3d0f719b04..0da4b289117a 100644 --- a/lib/galaxy/queue_worker/__init__.py +++ b/lib/galaxy/queue_worker/__init__.py @@ -4,6 +4,7 @@ """ import importlib +import json import logging import math import socket @@ -27,6 +28,10 @@ import galaxy.queues from galaxy import util from galaxy.config import reload_config_options +from galaxy.managers.sse import ( + SSEConnectionManager, + SSEEvent, +) from galaxy.model import User from galaxy.tools import ToolBox from galaxy.tools.data_manager.manager import DataManagers @@ -43,7 +48,12 @@ ) -def send_local_control_task(app: "StructuredApp", task: str, get_response: bool = False, kwargs: Optional[dict] = None): +def send_local_control_task( + app: "StructuredApp", + task: str, + get_response: bool = False, + kwargs: Optional[dict] = None, +): """ This sends a message to the process-local control worker, which is useful for one-time asynchronous tasks like recalculating user disk usage. 
@@ -57,7 +67,15 @@ def send_local_control_task(app: "StructuredApp", task: str, get_response: bool return control_task.send_task(payload, routing_key, local=True, get_response=get_response) -def send_control_task(app, task, noop_self=False, get_response=False, routing_key="control.*", kwargs=None): +def send_control_task( + app, + task, + noop_self=False, + get_response=False, + routing_key="control.*", + kwargs=None, + expiration: Optional[int] = None, +): """ This sends a control task out to all processes, useful for things like reloading a data table, which needs to happen individually in all @@ -65,6 +83,7 @@ def send_control_task(app, task, noop_self=False, get_response=False, routing_ke Set noop_self to True to not run task for current process. Set get_response to True to wait for and return the task results as a list. + Set expiration to a number of seconds for message TTL. """ if kwargs is None: kwargs = {} @@ -73,7 +92,9 @@ def send_control_task(app, task, noop_self=False, get_response=False, routing_ke if noop_self: payload["noop"] = app.config.server_name control_task = ControlTask(app.queue_worker) - return control_task.send_task(payload=payload, routing_key=routing_key, get_response=get_response) + return control_task.send_task( + payload=payload, routing_key=routing_key, get_response=get_response, expiration=expiration + ) class ControlTask: @@ -107,7 +128,15 @@ def on_response(self, message): if message.properties["correlation_id"] == self.correlation_id: self.response = message.payload["result"] - def send_task(self, payload, routing_key, local=False, get_response=False, timeout=10): + def send_task( + self, + payload, + routing_key, + local=False, + get_response=False, + timeout=10, + expiration: Optional[int] = None, + ): if local: declare_queues = self.control_queues else: @@ -129,14 +158,24 @@ def send_task(self, payload, routing_key, local=False, get_response=False, timeo correlation_id=self.correlation_id, retry=True, headers={"epoch": 
time.time()}, + expiration=expiration, ) if get_response: - with Consumer(self.connection, on_message=self.on_response, queues=callback_queue, no_ack=True): + with Consumer( + self.connection, + on_message=self.on_response, + queues=callback_queue, + no_ack=True, + ): while self.response is self._response: self.connection.drain_events(timeout=timeout) return self.response except TimeoutError: - log.exception("Error waiting for task: '%s' sent with routing key '%s'", payload, routing_key) + log.exception( + "Error waiting for task: '%s' sent with routing key '%s'", + payload, + routing_key, + ) except Exception: log.exception("Error queueing async task: '%s'. for %s", payload, routing_key) @@ -189,7 +228,10 @@ def _get_new_toolbox(app: "UniverseApplication", save_integrated_tool_panel: boo tool_configs = app.config.tool_configs new_toolbox = ToolBox( - tool_configs, app.config.tool_path, app, save_integrated_tool_panel=save_integrated_tool_panel + tool_configs, + app.config.tool_path, + app, + save_integrated_tool_panel=save_integrated_tool_panel, ) new_toolbox.data_manager_tools = app.toolbox.data_manager_tools app.datatypes_registry.load_datatype_converters(new_toolbox, use_cached=True) @@ -309,6 +351,44 @@ def admin_job_lock(app, **kwargs): log.info(f"Administrative Job Lock is now set to {job_lock}. 
def notify_users(app, **kwargs):
    """Control-task handler: push SSE notification events to connected users.

    Runs on every worker process; only workers that actually hold SSE
    connections for the target users deliver anything.
    """
    manager = app[SSEConnectionManager]
    event = SSEEvent(
        event="notification_update",
        data=kwargs.get("payload", "{}"),
        id=kwargs.get("event_id"),
    )
    for user_id in kwargs.get("user_ids", []):
        manager.push_to_user(user_id, event)


def notify_broadcast(app, **kwargs):
    """Control-task handler: push an SSE broadcast event to every connected client on this worker."""
    event = SSEEvent(
        event="broadcast_update",
        data=kwargs.get("payload", "{}"),
        id=kwargs.get("event_id"),
    )
    app[SSEConnectionManager].push_broadcast(event)


def history_update(app, **kwargs):
    """Control-task handler: push SSE history update events to connected users on this worker process.

    Encodes integer history IDs here (not in the monitor) so the manager layer
    stays free of presentation/security concerns.
    """
    manager = app[SSEConnectionManager]
    event_id = kwargs.get("event_id")
    encode = app.security.encode_id
    for user_id_str, history_ids in kwargs.get("user_updates", {}).items():
        payload = json.dumps({"history_ids": [encode(history_id) for history_id in history_ids]})
        manager.push_to_user(
            int(user_id_str),
            SSEEvent(event="history_update", data=payload, id=event_id),
        )
galaxy.managers.histories import HistoryManager from galaxy.managers.interactivetool import InteractiveToolManager from galaxy.managers.jobs import JobSearch + from galaxy.managers.sse import SSEConnectionManager from galaxy.managers.tools import DynamicToolManager from galaxy.managers.users import UserManager from galaxy.managers.workflows import ( @@ -57,7 +58,9 @@ WorkflowsManager, ) from galaxy.tool_shed.galaxy_install.client import DataManagersInterface - from galaxy.tool_shed.galaxy_install.installed_repository_manager import InstalledRepositoryManager + from galaxy.tool_shed.galaxy_install.installed_repository_manager import ( + InstalledRepositoryManager, + ) from galaxy.tool_util.data import ToolDataTableManager from galaxy.tools import ToolBox from galaxy.tools.cache import ToolCache @@ -174,6 +177,7 @@ class StructuredApp(MinimalManagerApp): vault: Vault webhooks_registry: WebhooksRegistry queue_worker: Any # 'galaxy.queue_worker.GalaxyQueueWorker' + sse_connection_manager: "SSEConnectionManager" data_provider_registry: Any # 'galaxy.visualization.data_providers.registry.DataProviderRegistry' tool_cache: "ToolCache" tool_shed_repository_cache: Optional[ToolShedRepositoryCache] diff --git a/lib/galaxy/webapps/galaxy/api/events.py b/lib/galaxy/webapps/galaxy/api/events.py new file mode 100644 index 000000000000..ad9a3b2e9d9e --- /dev/null +++ b/lib/galaxy/webapps/galaxy/api/events.py @@ -0,0 +1,66 @@ +""" +API endpoint for Server-Sent Events (SSE) stream. + +Provides a unified event stream for all real-time push events (notifications, +history updates, etc.) independent of the notification system configuration. +""" + +import logging +from typing import Optional + +from fastapi import ( + Header, + Request, +) +from starlette.responses import StreamingResponse + +from galaxy.managers.context import ProvidesUserContext +from galaxy.managers.sse import SSEConnectionManager +from galaxy.webapps.galaxy.services.notifications import NotificationService +from . 
import ( + depends, + DependsOnTrans, + Router, +) + +log = logging.getLogger(__name__) + +router = Router(tags=["events"]) + + +@router.cbv +class FastAPIEvents: + sse_manager: SSEConnectionManager = depends(SSEConnectionManager) + notifications: NotificationService = depends(NotificationService) + + @router.get( + "/api/events/stream", + summary="Server-Sent Events stream for real-time updates.", + response_class=StreamingResponse, + ) + async def stream_events( + self, + request: Request, + trans: ProvidesUserContext = DependsOnTrans, + last_event_id: Optional[str] = Header(None, alias="Last-Event-ID"), + ): + """Opens a Server-Sent Events (SSE) connection that pushes real-time + updates for notifications, history changes, and other events. + + On reconnect, the browser sends the ``Last-Event-ID`` header automatically. + If the notification system is enabled, any notifications created since that + timestamp are delivered as a catch-up ``notification_status`` event. + + Anonymous users receive only broadcast events. 
+ """ + user_id = trans.user.id if not trans.anonymous else None + catch_up = self.notifications.build_status_catchup(trans, last_event_id) + return StreamingResponse( + self.sse_manager.stream(request, user_id, catch_up=catch_up), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) diff --git a/lib/galaxy/webapps/galaxy/api/notifications.py b/lib/galaxy/webapps/galaxy/api/notifications.py index f1bfbcab88a3..ec490dc0dd1d 100644 --- a/lib/galaxy/webapps/galaxy/api/notifications.py +++ b/lib/galaxy/webapps/galaxy/api/notifications.py @@ -10,12 +10,16 @@ from fastapi import ( Body, + Header, Query, + Request, Response, status, ) +from starlette.responses import StreamingResponse from galaxy.managers.context import ProvidesUserContext +from galaxy.managers.sse import SSEConnectionManager from galaxy.schema.notifications import ( BroadcastNotificationCreateRequest, BroadcastNotificationListResponse, @@ -51,6 +55,39 @@ @router.cbv class FastAPINotifications: service: NotificationService = depends(NotificationService) + sse_manager: SSEConnectionManager = depends(SSEConnectionManager) + + @router.get( + "/api/notifications/stream", + summary="Server-Sent Events stream for real-time notification updates.", + response_class=StreamingResponse, + ) + async def stream_notifications( + self, + request: Request, + trans: ProvidesUserContext = DependsOnTrans, + last_event_id: Optional[str] = Header(None, alias="Last-Event-ID"), + ): + """Opens a Server-Sent Events (SSE) connection that pushes notification updates in real-time. + + On reconnect, the browser sends the ``Last-Event-ID`` header automatically. + Any notifications created since that timestamp are delivered as a catch-up + ``notification_status`` event before the stream begins. + + Anonymous users receive only broadcast events. 
+ """ + self.service.notification_manager.ensure_notifications_enabled() + user_id = trans.user.id if not trans.anonymous else None + catch_up = self.service.build_status_catchup(trans, last_event_id) + return StreamingResponse( + self.sse_manager.stream(request, user_id, catch_up=catch_up), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) @router.get( "/api/notifications/status", diff --git a/lib/galaxy/webapps/galaxy/services/notifications.py b/lib/galaxy/webapps/galaxy/services/notifications.py index ae3e44947197..c11e0af8737f 100644 --- a/lib/galaxy/webapps/galaxy/services/notifications.py +++ b/lib/galaxy/webapps/galaxy/services/notifications.py @@ -14,6 +14,7 @@ ) from galaxy.managers.context import ProvidesUserContext from galaxy.managers.notification import NotificationManager +from galaxy.managers.sse import SSEEvent from galaxy.model import User from galaxy.schema.fields import Security from galaxy.schema.notifications import ( @@ -99,6 +100,28 @@ def broadcast( total_notifications_sent=1, notification=NotificationResponse.model_validate(notification) ) + def build_status_catchup( + self, user_context: ProvidesUserContext, last_event_id: Optional[str] + ) -> Optional[SSEEvent]: + """Build a ``notification_status`` SSE event covering everything since ``last_event_id``. + + Returns ``None`` when catch-up isn't possible (no ``Last-Event-ID``, + unparseable timestamp, or notifications disabled) so callers can simply + pass the result to ``SSEConnectionManager.stream`` without extra guards. 
+ """ + if not last_event_id or not self.notification_manager.notifications_enabled: + return None + try: + since = datetime.fromisoformat(last_event_id) + except (ValueError, TypeError): + return None + catchup = self.get_notifications_status(user_context, since) + return SSEEvent( + event="notification_status", + data=catchup.model_dump_json(), + id=datetime.utcnow().isoformat(), + ) + def get_notifications_status(self, user_context: ProvidesUserContext, since: datetime) -> NotificationStatusSummary: """Returns the status of (unread or updated) notifications received by the user **since** a particular date and time. diff --git a/lib/galaxy_test/base/sse.py b/lib/galaxy_test/base/sse.py new file mode 100644 index 000000000000..47ee4172be4c --- /dev/null +++ b/lib/galaxy_test/base/sse.py @@ -0,0 +1,136 @@ +"""Shared helpers for SSE integration tests. + +The stream layer emits a ``ready`` event as the first frame on every connection +so tests can synchronize on the server-side subscription rather than the +underlying TCP socket. ``SSELineListener`` waits for that ``ready`` event before +``start()`` returns, and propagates listener-thread exceptions back to the main +thread instead of silently swallowing them. 
+""" + +import queue +import threading +from typing import ( + Optional, +) + +import requests + +from galaxy.util.wait import wait_on + +CONNECT_TIMEOUT = 15 +DEFAULT_WAIT_TIMEOUT = 15 + + +def parse_sse_events(raw: str) -> list[dict]: + """Parse raw SSE text into a list of event dicts with ``event``, ``data``, and ``id`` keys.""" + events: list[dict] = [] + current: dict[str, str] = {} + for line in raw.split("\n"): + if line.startswith(":"): + continue # comment / keepalive + if line == "": + if current: + events.append(current) + current = {} + continue + if ": " in line: + field, _, value = line.partition(": ") + else: + field, value = line.rstrip(":"), "" + if field in ("event", "data", "id"): + current[field] = value + if current: + events.append(current) + return events + + +class SSEListenerError(Exception): + """Wraps an exception raised inside the listener thread.""" + + +class SSELineListener: + """Runs an SSE connection on a background thread and collects raw chunks. + + ``start()`` blocks until the server-side ``ready`` event has been received, + guaranteeing that any event *posted after* ``start()`` returns will be seen + by this listener. Failures in the background thread are surfaced via + ``wait_for_event`` instead of silently timing out. 
+ """ + + def __init__( + self, + url: str, + api_key: str, + headers: Optional[dict] = None, + timeout: int = 30, + ): + self.url = url + self.api_key = api_key + self.headers = headers or {} + self.timeout = timeout + self._collected: list[str] = [] + self._stop = threading.Event() + self._ready = threading.Event() + self._errors: "queue.Queue[BaseException]" = queue.Queue() + self._thread = threading.Thread(target=self._listen, daemon=True) + + def start(self) -> None: + self._thread.start() + wait_on( + lambda: True if self._ready.is_set() else None, + "SSE `ready` event", + timeout=CONNECT_TIMEOUT, + ) + self._raise_if_errored() + + def stop(self) -> None: + self._stop.set() + self._thread.join(timeout=5) + + def wait_for_event(self, event_type: str, timeout: int = DEFAULT_WAIT_TIMEOUT) -> list[dict]: + """Block until at least one event of ``event_type`` has been observed, then return all such events.""" + + def _check(): + self._raise_if_errored() + events = self.get_events(event_type) + return events if events else None + + return wait_on(_check, f"SSE {event_type} event", timeout=timeout) + + def get_events(self, event_type: Optional[str] = None) -> list[dict]: + """Return all collected events so far, optionally filtered by type.""" + all_events = parse_sse_events("".join(self._collected)) + if event_type is None: + return all_events + return [e for e in all_events if e.get("event") == event_type] + + def _raise_if_errored(self) -> None: + try: + err = self._errors.get_nowait() + except queue.Empty: + return + raise SSEListenerError(f"SSE listener thread failed: {err!r}") from err + + def _listen(self) -> None: + try: + resp = requests.get( + self.url, + params={"key": self.api_key}, + headers=self.headers, + stream=True, + timeout=self.timeout, + ) + if resp.status_code != 200: + raise RuntimeError(f"SSE connect returned HTTP {resp.status_code}: {resp.text[:200]}") + for chunk in resp.iter_content(chunk_size=None, decode_unicode=True): + if chunk: + 
self._collected.append(chunk) + if not self._ready.is_set() and "event: ready" in "".join(self._collected): + self._ready.set() + if self._stop.is_set(): + break + resp.close() + except BaseException as exc: + self._errors.put(exc) + # Ensure start() doesn't hang forever on connection failure. + self._ready.set() diff --git a/test/integration/test_history_sse.py b/test/integration/test_history_sse.py new file mode 100644 index 000000000000..6dd15c7afea1 --- /dev/null +++ b/test/integration/test_history_sse.py @@ -0,0 +1,131 @@ +"""Integration tests for SSE-based history update notifications.""" + +import json +from urllib.parse import urljoin +from uuid import uuid4 + +import requests + +from galaxy_test.base.populators import DatasetPopulator +from galaxy_test.base.sse import SSELineListener +from galaxy_test.driver.integration_util import IntegrationTestCase + + +class TestHistorySSEIntegration(IntegrationTestCase): + dataset_populator: DatasetPopulator + framework_tool_and_types = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + super().handle_galaxy_config_kwds(config) + config["enable_celery_tasks"] = False + config["enable_sse_history_updates"] = True + + def setUp(self): + super().setUp() + self.dataset_populator = DatasetPopulator(self.galaxy_interactor) + + def _events_stream_url(self) -> str: + return urljoin(self.url, "api/events/stream") + + def _create_history(self, name=None) -> str: + name = name or f"test_history_{uuid4()}" + response = self._post("histories", data={"name": name}, json=True) + self._assert_status_code_is_ok(response) + return response.json()["id"] + + def test_sse_events_endpoint_returns_event_stream(self): + """The /api/events/stream endpoint should return content-type text/event-stream.""" + response = requests.get( + self._events_stream_url(), + params={"key": self.galaxy_interactor.api_key}, + stream=True, + timeout=5, + ) + assert response.status_code == 200 + assert "text/event-stream" in 
response.headers.get("content-type", "") + response.close() + + def test_sse_receives_history_update_on_dataset_upload(self): + """When a dataset is uploaded, a history_update SSE event should be received.""" + history_id = self._create_history() + + listener = SSELineListener(self._events_stream_url(), self.galaxy_interactor.api_key) + listener.start() + try: + self.dataset_populator.new_dataset(history_id, wait=False) + history_events = listener.wait_for_event("history_update") + assert len(history_events) > 0 + finally: + listener.stop() + + def test_history_update_contains_current_history_id(self): + """The history_update event should contain the history's encoded ID.""" + history_id = self._create_history() + + listener = SSELineListener(self._events_stream_url(), self.galaxy_interactor.api_key) + listener.start() + try: + self.dataset_populator.new_dataset(history_id, wait=False) + history_events = listener.wait_for_event("history_update") + found = any(history_id in json.loads(e["data"]).get("history_ids", []) for e in history_events) + assert found, f"Expected history_id '{history_id}' in history_update events, got: {history_events}" + finally: + listener.stop() + + def test_history_update_is_scoped_to_owning_user(self): + """User A must only see history_update events for their own histories. + + Inverted positive assertion: after user B's upload, user A uploads to + their own history and we assert A's stream contains A's encoded id and + not B's. This avoids a sleep-based "no events" test that was prone to + flaking under slow CI. + """ + user_b = self._setup_user(f"{uuid4()}@galaxy.test") + _, user_b_api_key = self._setup_user_get_key(user_b["email"]) + + user_a_history_id = self._create_history() + + listener = SSELineListener(self._events_stream_url(), self.galaxy_interactor.api_key) + listener.start() + try: + # User B creates a history and uploads to it. User A must NOT see this. 
+ create_resp = requests.post( + urljoin(self.url, "api/histories"), + params={"key": user_b_api_key}, + json={"name": "User B History"}, + ) + assert create_resp.status_code == 200 + user_b_history_id = create_resp.json()["id"] + + requests.post( + urljoin(self.url, f"api/histories/{user_b_history_id}/contents"), + params={"key": user_b_api_key}, + json={"from_hda_id": None, "source": "pasted", "content": "user b content"}, + ) + + # User A uploads to their own history — this is what A's stream must observe. + self.dataset_populator.new_dataset(user_a_history_id, wait=False) + history_events = listener.wait_for_event("history_update") + finally: + listener.stop() + + seen_ids: set[str] = set() + for event in history_events: + seen_ids.update(json.loads(event["data"]).get("history_ids", [])) + assert user_a_history_id in seen_ids, f"User A missed its own history_update: {history_events}" + assert ( + user_b_history_id not in seen_ids + ), f"User A received history_update for user B's history ({user_b_history_id}): {history_events}" + + def test_existing_polling_api_still_works(self): + """The existing current_history_json endpoint should continue to work.""" + url = urljoin(self.url, "history/current_history_json") + response = requests.get( + url, + params={"key": self.galaxy_interactor.api_key}, + ) + assert response.status_code == 200 + data = response.json() + assert "id" in data + assert "update_time" in data diff --git a/test/integration/test_notification_sse.py b/test/integration/test_notification_sse.py new file mode 100644 index 000000000000..5c8622514a42 --- /dev/null +++ b/test/integration/test_notification_sse.py @@ -0,0 +1,186 @@ +"""Integration tests for the notification SSE (Server-Sent Events) endpoint.""" + +from datetime import datetime +from typing import Optional +from urllib.parse import urljoin +from uuid import uuid4 + +import requests + +from galaxy_test.base.populators import DatasetPopulator +from galaxy_test.base.sse import 
SSELineListener +from galaxy_test.driver.integration_util import IntegrationTestCase + + +def notification_test_data(subject: Optional[str] = None, message: Optional[str] = None) -> dict: + return { + "source": "integration_tests", + "variant": "info", + "category": "message", + "content": { + "category": "message", + "subject": subject or "Testing Subject", + "message": message or "Testing Message", + }, + } + + +def notification_broadcast_test_data(subject: Optional[str] = None, message: Optional[str] = None) -> dict: + return { + "source": "integration_tests", + "variant": "info", + "category": "broadcast", + "content": { + "category": "broadcast", + "subject": subject or "Testing Broadcast Subject", + "message": message or "Testing Broadcast Message", + }, + } + + +class TestNotificationSSEIntegration(IntegrationTestCase): + dataset_populator: DatasetPopulator + framework_tool_and_types = False + + @classmethod + def handle_galaxy_config_kwds(cls, config): + super().handle_galaxy_config_kwds(config) + config["enable_celery_tasks"] = False + config["enable_notification_system"] = True + + def setUp(self): + super().setUp() + self.dataset_populator = DatasetPopulator(self.galaxy_interactor) + + def _stream_url(self) -> str: + return urljoin(self.url, "api/notifications/stream") + + def test_sse_endpoint_returns_event_stream(self): + """The SSE endpoint should return content-type text/event-stream.""" + response = requests.get( + self._stream_url(), + params={"key": self.galaxy_interactor.api_key}, + stream=True, + timeout=5, + ) + assert response.status_code == 200 + assert "text/event-stream" in response.headers.get("content-type", "") + response.close() + + def test_sse_receives_notification_events(self): + """When a notification is created, the SSE stream should receive it.""" + user = self._setup_user(f"{uuid4()}@galaxy.test") + _, user_api_key = self._setup_user_get_key(user["email"]) + + listener = SSELineListener(self._stream_url(), user_api_key) + 
listener.start() + try: + subject = f"sse_test_{uuid4()}" + request = { + "recipients": {"user_ids": [user["id"]]}, + "notification": notification_test_data(subject=subject, message="SSE test notification"), + } + response = self._post("notifications", data=request, admin=True, json=True) + self._assert_status_code_is_ok(response) + + notification_events = listener.wait_for_event("notification_update") + finally: + listener.stop() + + assert any( + subject in e.get("data", "") for e in notification_events + ), f"Expected subject '{subject}' in SSE events, got: {notification_events}" + + def test_sse_receives_broadcast_events(self): + """When a broadcast is created, the SSE stream should receive it.""" + listener = SSELineListener(self._stream_url(), self.galaxy_interactor.api_key) + listener.start() + try: + subject = f"broadcast_sse_test_{uuid4()}" + payload = notification_broadcast_test_data(subject=subject) + response = self._post("notifications/broadcast", data=payload, admin=True, json=True) + self._assert_status_code_is_ok(response) + + broadcast_events = listener.wait_for_event("broadcast_update") + finally: + listener.stop() + + assert any( + subject in e.get("data", "") for e in broadcast_events + ), f"Expected subject '{subject}' in broadcast SSE events, got: {broadcast_events}" + + def test_sse_catchup_on_reconnect(self): + """Reconnecting with Last-Event-ID should replay a catch-up notification_status event. + + The ``Last-Event-ID`` value is the server-issued ID from a prior event, + not a client-side ``datetime.utcnow()``. This avoids clock-skew flake + between the test runner and the app in containerized CI. + """ + user = self._setup_user(f"{uuid4()}@galaxy.test") + _, user_api_key = self._setup_user_get_key(user["email"]) + + # First connection: capture the server-issued event id of the first notification. 
+ listener_1 = SSELineListener(self._stream_url(), user_api_key) + listener_1.start() + try: + subject_1 = f"first_{uuid4()}" + request = { + "recipients": {"user_ids": [user["id"]]}, + "notification": notification_test_data(subject=subject_1), + } + response = self._post("notifications", data=request, admin=True, json=True) + self._assert_status_code_is_ok(response) + first_events = listener_1.wait_for_event("notification_update") + finally: + listener_1.stop() + + last_event_id = next((e["id"] for e in first_events if e.get("id")), None) + assert last_event_id, f"No server-issued id on first notification event: {first_events}" + + # Emit a second notification while disconnected; it should appear in the catch-up. + subject_2 = f"catchup_{uuid4()}" + request = { + "recipients": {"user_ids": [user["id"]]}, + "notification": notification_test_data(subject=subject_2, message="Catch-up test"), + } + response = self._post("notifications", data=request, admin=True, json=True) + self._assert_status_code_is_ok(response) + + # Reconnect with Last-Event-ID = the captured id. The server catch-up runs before + # the `ready` event and must include the missed notification. 
+ listener_2 = SSELineListener( + self._stream_url(), + user_api_key, + headers={"Last-Event-ID": last_event_id}, + ) + listener_2.start() + try: + status_events = listener_2.wait_for_event("notification_status") + finally: + listener_2.stop() + + assert any( + subject_2 in e.get("data", "") for e in status_events + ), f"Expected subject '{subject_2}' in catch-up event, got: {status_events}" + + def test_existing_polling_api_still_works(self): + """The existing polling endpoint should continue to work alongside SSE.""" + user = self._setup_user(f"{uuid4()}@galaxy.test") + + before = datetime.utcnow() + + subject = f"polling_test_{uuid4()}" + request = { + "recipients": {"user_ids": [user["id"]]}, + "notification": notification_test_data(subject=subject), + } + response = self._post("notifications", data=request, admin=True, json=True) + self._assert_status_code_is_ok(response) + + with self._different_user(user["email"]): + status_response = self._get(f"notifications/status?since={before.isoformat()}") + self._assert_status_code_is_ok(status_response) + status = status_response.json() + assert status["total_unread_count"] == 1 + assert len(status["notifications"]) == 1 + assert status["notifications"][0]["content"]["subject"] == subject diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py new file mode 100644 index 000000000000..e3e9a564c844 --- /dev/null +++ b/test/integration_selenium/test_notification_sse.py @@ -0,0 +1,110 @@ +"""Playwright E2E test for the notification SSE pipeline. + +Verifies that when an admin creates a notification via the API, +a logged-in user sees it appear in the UI in real-time (within seconds) +without a page refresh, proving the SSE push pipeline works end-to-end. 
+""" + +from uuid import uuid4 + +from galaxy.util.wait import wait_on +from galaxy_test.selenium.framework import ( + managed_history, + selenium_test, +) +from .framework import SeleniumIntegrationTestCase + +SSE_CONNECT_TIMEOUT_SECONDS = 15 + + +class TestNotificationSSESeleniumIntegration(SeleniumIntegrationTestCase): + ensure_registered = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + super().handle_galaxy_config_kwds(config) + config["enable_notification_system"] = True + config["enable_celery_tasks"] = False + + def _wait_for_sse_connected(self) -> None: + """Block until the frontend confirms the SSE pipeline is live. + + Without this gate, the 30 s polling fallback silently masks a broken + SSE pipeline — the UI would still update, but via polling, and the + test would falsely pass. + """ + wait_on( + lambda: True if self.driver.execute_script("return window.__galaxy_sse_connected === true") else None, + "window.__galaxy_sse_connected === true", + timeout=SSE_CONNECT_TIMEOUT_SECONDS, + ) + + @selenium_test + @managed_history + def test_notification_appears_via_sse(self): + """Send a notification via the API and verify it appears in the UI without refresh.""" + # Get the logged-in user's info so we can send a notification to them + user_info = self._get("users/current").json() + user_id = user_info["id"] + + # Navigate to notifications page so the store is watching + self.driver.get(f"{self.target_url_from_selenium}/user/notifications") + self._wait_for_sse_connected() + self.screenshot("notification_sse_before") + + # Send a notification to this user via the admin API + subject = f"SSE E2E Test {uuid4()}" + notification_request = { + "recipients": {"user_ids": [user_id]}, + "notification": { + "source": "integration_tests", + "variant": "info", + "category": "message", + "content": { + "category": "message", + "subject": subject, + "message": "This notification was pushed via SSE", + }, + }, + } + response = 
self._post("notifications", data=notification_request, admin=True, json=True) + self._assert_status_code_is_ok(response) + + # Wait for the notification to appear in the UI — SSE should push it + # within a few seconds, without needing a page refresh. + # We wait up to 15 seconds checking for the subject text to appear. + self.driver.wait_for_selector_visible(f"text={subject}", timeout=15000) + self.screenshot("notification_sse_after") + + @selenium_test + @managed_history + def test_notification_bell_updates_via_sse(self): + """The notification bell indicator should update when a new notification arrives via SSE.""" + user_info = self._get("users/current").json() + user_id = user_info["id"] + + # Go to home page (bell is in masthead) + self.home() + self._wait_for_sse_connected() + + # Send a notification + subject = f"Bell Test {uuid4()}" + notification_request = { + "recipients": {"user_ids": [user_id]}, + "notification": { + "source": "integration_tests", + "variant": "info", + "category": "message", + "content": { + "category": "message", + "subject": subject, + "message": "Testing bell indicator update via SSE", + }, + }, + } + response = self._post("notifications", data=notification_request, admin=True, json=True) + self._assert_status_code_is_ok(response) + + # The indicator dot should appear on the bell (within the #activity-notifications element) + self.driver.wait_for_selector_visible("#activity-notifications .indicator", timeout=15000) + self.screenshot("notification_bell_indicator") From 2ba3863086d8756e7206bc46dfc49273551ed5eb Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 16 Apr 2026 22:27:44 +0200 Subject: [PATCH 02/47] Fix SSE control-queue routing for Celery + post-fork workers Three related issues uncovered while exercising the SSE push path end-to-end: - Celery workers could not publish SSE control tasks. 
Move the AMQP connection + publisher-only GalaxyQueueWorker construction up into GalaxyManagerApplication so every manager app gets a publisher, and add an explicit bind_publisher() entry point. Web workers still start a consumer via bind_and_start post-fork. - Under gunicorn --preload, config.server_name is rewritten post-fork (main -> main.1) but the pre-fork publisher bindings were never refreshed, so the consumer listened on control.main@host while producers published to control.main.1@host. bind_and_start now always re-invokes bind_publisher so consumer queues match the post-fork identity. - all_control_queues_for_declare required database_heartbeat, which Celery workers don't run. Query WorkerProcess directly (with an active-window filter), and add a webapp_only flag so SSEEventDispatcher only fans out to processes that actually have browser connections. --- lib/galaxy/app/__init__.py | 15 +++++++--- lib/galaxy/managers/sse.py | 23 +++++++-------- lib/galaxy/queue_worker/__init__.py | 33 ++++++++++++++++++---- lib/galaxy/queues/__init__.py | 44 +++++++++++++++++++++++++---- 4 files changed, 89 insertions(+), 26 deletions(-) diff --git a/lib/galaxy/app/__init__.py b/lib/galaxy/app/__init__.py index 672efde7dc6e..7c1a1e097243 100644 --- a/lib/galaxy/app/__init__.py +++ b/lib/galaxy/app/__init__.py @@ -682,6 +682,14 @@ def __init__( self.role_manager = self._register_singleton(RoleManager) self.job_manager = self._register_singleton(JobManager) + # AMQP connection + a publisher-ready queue worker. Celery worker processes + # inherit this via GalaxyManagerApplication so they can fan out SSE events + # to web workers (no consumer thread is started here — see bind_publisher). 
+ self.amqp_internal_connection_obj = galaxy.queues.connection_from_config(self.config) + if self.amqp_internal_connection_obj is not None: + self.queue_worker = self._register_singleton(GalaxyQueueWorker, GalaxyQueueWorker(self)) + self.queue_worker.bind_publisher() + # SSE dispatcher must be registered before NotificationManager so Lagom # can auto-inject the Optional[SSEEventDispatcher] constructor arg. self._register_singleton(SSEEventDispatcher, SSEEventDispatcher(self)) @@ -840,10 +848,9 @@ def __init__(self, **kwargs) -> None: # A lot of postfork initialization depends on the server name, ensure it is set immediately after forking before other postfork functions self.application_stack.register_postfork_function(self.application_stack.set_postfork_server_name, self) self.config.reload_sanitize_allowlist(explicit="sanitize_allowlist_file" in kwargs) - self.amqp_internal_connection_obj = galaxy.queues.connection_from_config(self.config) - # queue_worker *can* be initialized with a queue, but here we don't - # want to and we'll allow postfork to bind and start it. - self.queue_worker = self._register_singleton(GalaxyQueueWorker, GalaxyQueueWorker(self)) + # amqp_internal_connection_obj and queue_worker are built in GalaxyManagerApplication + # (so Celery workers also get a publisher); here we only register the consumer path, + # which is started later via the application_stack postfork hook. # SSE connection manager for real-time notification push self.sse_connection_manager = self._register_singleton(SSEConnectionManager) diff --git a/lib/galaxy/managers/sse.py b/lib/galaxy/managers/sse.py index d63532294f9e..ade467c3fe53 100644 --- a/lib/galaxy/managers/sse.py +++ b/lib/galaxy/managers/sse.py @@ -176,14 +176,11 @@ async def stream( class SSEEventDispatcher: """Fans out SSE events across all Galaxy worker processes via the control queue. 
- This is a thin wrapper around ``send_control_task`` so that managers can - depend on a narrow, injectable collaborator instead of importing the - queue-worker module directly. - - In Celery / background-task processes the app object has no ``queue_worker`` - (it's only built in ``UniverseApplication``), so dispatch is silently a - no-op there — notifications created from Celery tasks will still be - delivered the next time a client polls, just not pushed in real time. + Thin wrapper around ``send_control_task`` so that managers can depend on a + narrow, injectable collaborator instead of importing the queue-worker + module directly. Works in both web-worker and Celery-worker contexts — + ``GalaxyManagerApplication`` sets up a publisher-only ``queue_worker`` for + the Celery side. """ def __init__(self, app: "MinimalManagerApp") -> None: @@ -191,12 +188,16 @@ def __init__(self, app: "MinimalManagerApp") -> None: def _send(self, task: str, kwargs: dict) -> None: if getattr(self._app, "queue_worker", None) is None: - # No control-queue publisher available (e.g. Celery worker context). - log.debug("SSE dispatch skipped: app has no queue_worker (task=%s)", task) + # AMQP not configured at all (e.g. unit-test mock app). Skip silently. + log.debug("SSE dispatch skipped: no queue_worker configured (task=%s)", task) return from galaxy.queue_worker import send_control_task # circular: queue_worker -> app -> managers + from galaxy.queues import all_control_queues_for_declare - send_control_task(self._app, task, kwargs=kwargs, expiration=10) + # Only fan out to webapp processes — job handlers and workflow schedulers + # don't have browser SSE connections to push to. 
+ declare_queues = all_control_queues_for_declare(self._app.application_stack, webapp_only=True) + send_control_task(self._app, task, kwargs=kwargs, expiration=10, declare_queues=declare_queues) def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str] = None) -> None: self._send( diff --git a/lib/galaxy/queue_worker/__init__.py b/lib/galaxy/queue_worker/__init__.py index 0da4b289117a..6904a3a9e934 100644 --- a/lib/galaxy/queue_worker/__init__.py +++ b/lib/galaxy/queue_worker/__init__.py @@ -75,6 +75,7 @@ def send_control_task( routing_key="control.*", kwargs=None, expiration: Optional[int] = None, + declare_queues=None, ): """ This sends a control task out to all processes, useful for things like @@ -84,6 +85,8 @@ def send_control_task( Set get_response to True to wait for and return the task results as a list. Set expiration to a number of seconds for message TTL. + Pass ``declare_queues`` to override the default active-processes list — + e.g. the SSE dispatcher uses this to restrict fan-out to webapp processes. 
""" if kwargs is None: kwargs = {} @@ -93,7 +96,11 @@ def send_control_task( payload["noop"] = app.config.server_name control_task = ControlTask(app.queue_worker) return control_task.send_task( - payload=payload, routing_key=routing_key, get_response=get_response, expiration=expiration + payload=payload, + routing_key=routing_key, + get_response=get_response, + expiration=expiration, + declare_queues=declare_queues, ) @@ -136,10 +143,11 @@ def send_task( get_response=False, timeout=10, expiration: Optional[int] = None, + declare_queues=None, ): if local: declare_queues = self.control_queues - else: + elif declare_queues is None: declare_queues = self.declare_queues reply_to = None callback_queue = [] @@ -459,17 +467,32 @@ def send_local_control_task(self, task, get_response=False, kwargs=None): @property def declare_queues(self): - # dynamically produce queues, allows addressing all known processes at a given time + # Dynamically produce queues, allows addressing all known processes at a given time. return galaxy.queues.all_control_queues_for_declare(self.app.application_stack) + def bind_publisher(self): + """Set up the queues needed to PUBLISH control tasks (no consumer thread). + + Safe to call from any process that needs to produce control messages — notably + Celery workers, which want to fan out SSE events to web workers but must not + start a consumer themselves. + + Always (re)binds. A prefork call in ``GalaxyManagerApplication.__init__`` binds + using the parent's ``config.server_name``; under gunicorn with ``--preload`` + the child's ``set_postfork_server_name`` mutates ``server_name`` to e.g. + ``main.1`` after fork. ``bind_and_start`` calls back into this so the + consumer's queues match what post-fork producers declare. 
+ """ + self.exchange_queue, self.direct_queue = galaxy.queues.control_queues_from_config(self.app.config) + self.control_queues = [self.exchange_queue, self.direct_queue] + def bind_and_start(self): # This is post-forking, so we got the correct sever name log.info( "Binding and starting galaxy control worker for %s", self.app.config.server_name, ) - self.exchange_queue, self.direct_queue = galaxy.queues.control_queues_from_config(self.app.config) - self.control_queues = [self.exchange_queue, self.direct_queue] + self.bind_publisher() self.epoch = time.time() self.start() diff --git a/lib/galaxy/queues/__init__.py b/lib/galaxy/queues/__init__.py index ab6edafcf5b0..9643a395595b 100644 --- a/lib/galaxy/queues/__init__.py +++ b/lib/galaxy/queues/__init__.py @@ -4,6 +4,8 @@ """ +import datetime +import logging import socket from typing import Optional @@ -12,21 +14,51 @@ Exchange, Queue, ) +from sqlalchemy import select + +from galaxy.model import WorkerProcess +from galaxy.model.orm.now import now + +log = logging.getLogger(__name__) ALL_CONTROL = "control.*" galaxy_exchange = Exchange("galaxy_core_exchange", type="topic") +DEFAULT_ACTIVE_PROCESS_WINDOW_SECONDS = 120 +# Matches WorkerProcess.app_type set by DatabaseHeartbeat for webapp processes. +WEBAPP_APP_TYPE = "webapp" + -def all_control_queues_for_declare(application_stack): +def all_control_queues_for_declare(application_stack, webapp_only: bool = False): """ For in-memory routing (used by sqlalchemy-based transports), we need to be able to build the entire routing table in producers. + + Queries ``WorkerProcess`` directly rather than going through + ``DatabaseHeartbeat`` so this works from Celery workers too — they have a + ``model`` but no heartbeat thread. Without this, a notification created in + a Celery task publishes a ``notify_users`` control task with an empty + ``declare`` list, so on the sqlalchemy+sqlite kombu transport the message + never lands in a web worker's queue. 
+ + When ``webapp_only`` is True, only returns queues for processes that have + registered themselves with ``app_type='webapp'``. This is what the SSE + dispatcher wants: job handlers and workflow schedulers have no browser + connections, so routing SSE events to them is wasted work. """ - # Get all active processes and construct queues for each process - process_names = ( - f"{p.server_name}@{p.hostname}" for p in application_stack.app.database_heartbeat.get_active_processes() - ) - return [Queue(f"control.{server_name}", galaxy_exchange, routing_key="control.*") for server_name in process_names] + app = application_stack.app + try: + stmt = select(WorkerProcess).where( + WorkerProcess.update_time > now() - datetime.timedelta(seconds=DEFAULT_ACTIVE_PROCESS_WINDOW_SECONDS) + ) + if webapp_only: + stmt = stmt.where(WorkerProcess.app_type == WEBAPP_APP_TYPE) + with app.model.new_session() as session: + processes = session.scalars(stmt).all() + except Exception: + log.debug("Failed to look up active processes for control-queue declare", exc_info=True) + return [] + return [Queue(f"control.{p.server_name}@{p.hostname}", galaxy_exchange, routing_key="control.*") for p in processes] def control_queues_from_config(config): From 276ffab6f9071b7d67b28f8915371d479e1435d7 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 16 Apr 2026 22:27:53 +0200 Subject: [PATCH 03/47] Fix history SSE fallback switching and forced refresh The first iteration of SSE-driven history updates had two client bugs: - startWatchingHistoryWithSSE() was called from many places (component mounts, upload/tool/workflow completion hooks) and each call re-opened the EventSource. That flapped `connected` false -> true repeatedly, so the watch that was supposed to stop the 3s poll on SSE-up never had a stable transition and polling kept running. Make SSE init idempotent and drive start/stop of the polling fallback from the connected ref. 
- watchHistoryOnce short-circuits when lastUpdateTime >= history.update_time, so an SSE push arriving before the client's next poll tick produced no UI change until a full history reload. Add refreshHistoryFromPush() that forces the fetch, skipping both the `since` query param and the update_time gate, and use it from the SSE handler. --- client/src/stores/historyStore.ts | 46 ++++++++++++++---- client/src/watch/watchHistory.js | 79 +++++++++++++++++-------------- 2 files changed, 80 insertions(+), 45 deletions(-) diff --git a/client/src/stores/historyStore.ts b/client/src/stores/historyStore.ts index 1d6c40acebc8..91d8462f6e98 100644 --- a/client/src/stores/historyStore.ts +++ b/client/src/stores/historyStore.ts @@ -1,5 +1,5 @@ import { defineStore } from "pinia"; -import { computed, del, ref, set } from "vue"; +import { computed, del, ref, set, watch } from "vue"; import { type AnyHistory, @@ -31,8 +31,8 @@ import { sortByObjectProp } from "@/utils/sorting"; import { ACTIVE_POLLING_INTERVAL, INACTIVE_POLLING_INTERVAL, + refreshHistoryFromPush as refreshHistoryFromPushSuppliedApp, watchHistory as watchHistorySuppliedApp, - watchHistoryOnce as watchHistoryOnceSuppliedApp, } from "@/watch/watchHistory"; const PAGINATION_LIMIT = 10; @@ -396,23 +396,28 @@ export const useHistoryStore = defineStore("historyStore", () => { // SSE-driven history updates: when we receive a history_update event, // immediately trigger a refresh of the current history const SSE_HISTORY_EVENT_TYPES = ["history_update"] as const; - const { connect: sseHistoryConnect } = useSSE(handleHistorySSEEvent, SSE_HISTORY_EVENT_TYPES); + const { connect: sseHistoryConnect, connected: sseHistoryConnected } = useSSE( + handleHistorySSEEvent, + SSE_HISTORY_EVENT_TYPES, + ); function handleHistorySSEEvent(event: MessageEvent) { try { const data = JSON.parse(event.data); const changedHistoryIds: string[] = data.history_ids ?? 
[]; - // If the current history was updated, trigger a refresh if (currentHistoryId.value && changedHistoryIds.includes(currentHistoryId.value)) { + // SSE is itself the signal that the history changed — force the + // refresh so the update_time short-circuit in watchHistoryOnce + // can't suppress the contents fetch. const app = getGalaxyInstance(); - watchHistoryOnceSuppliedApp(app); + refreshHistoryFromPushSuppliedApp(app).catch((err) => + console.error("Error refreshing history from SSE push:", err), + ); } } catch (e) { console.error("Error handling history SSE event:", e); } } - - // Polling fallback — keeps running as a safety net even when SSE is connected const { startWatchingResource: startWatchingHistory, stopWatchingResource: stopWatchingHistory, @@ -422,11 +427,32 @@ export const useHistoryStore = defineStore("historyStore", () => { longPollingInterval: INACTIVE_POLLING_INTERVAL, }); + // When the SSE pipeline is live it delivers history_update events directly + // and we stop the 3-second poll. If SSE drops (server unsupported, repeated + // errors, network blip) the watch below resumes polling as the fallback. + // + // The public `startWatchingHistory` alias is called from many places (component + // mounts, upload/tool/workflow completion callbacks). We must NOT reconnect SSE + // on every call — that would flap `connected` false→true repeatedly, and + // `onopen` may never stably fire. Initialize SSE exactly once. 
+ let sseInitialized = false; function startWatchingHistoryWithSSE() { - // Always start polling as a baseline - startWatchingHistory(); - // Also connect SSE for instant updates + if (sseInitialized) { + return; + } + sseInitialized = true; sseHistoryConnect(); + watch( + sseHistoryConnected, + (isConnected) => { + if (isConnected) { + stopWatchingHistory(); + } else { + startWatchingHistory(); + } + }, + { immediate: true }, + ); } async function loadHistoryById(historyId: string) { diff --git a/client/src/watch/watchHistory.js b/client/src/watch/watchHistory.js index 2860325e4610..e99f8d5676e4 100644 --- a/client/src/watch/watchHistory.js +++ b/client/src/watch/watchHistory.js @@ -39,53 +39,62 @@ export async function watchHistory(app) { } export async function watchHistoryOnce(app) { + return _fetchHistoryAndChangedItems(app, { force: false }); +} + +/** + * Forces a fresh history + changed-items fetch, ignoring the `lastUpdateTime` + * short-circuit. Use this when an out-of-band signal (SSE `history_update`) + * already told us the history changed — we shouldn't re-gate on update_time. + */ +export async function refreshHistoryFromPush(app) { + return _fetchHistoryAndChangedItems(app, { force: true }); +} + +async function _fetchHistoryAndChangedItems(app, { force }) { const historyStore = useHistoryStore(); const historyItemsStore = useHistoryItemsStore(); const datasetStore = useDatasetStore(); const collectionElementsStore = useCollectionElementsStore(); - // get current history const checkForUpdate = new Date(); - const history = await historyStore.loadCurrentHistory(lastUpdateTime); + // When forced, skip the `since` filter so the server always returns the history. + const history = await historyStore.loadCurrentHistory(force ? 
undefined : lastUpdateTime); const { lastCheckedTime } = storeToRefs(historyItemsStore); lastCheckedTime.value = checkForUpdate; if (!history || !history.id) { return; } - // continue if the history update time has changed - if (!lastUpdateTime || lastUpdateTime < history.update_time) { - const historyId = history.id; - lastUpdateTime = history.update_time; - historyItemsStore.setLastUpdateTime(); - // execute request to obtain recently changed items - const params = { - v: "dev", - limit: limit, - q: "update_time-ge", - qv: lastRequestDate.toISOString(), - }; - // request detailed info only for the expanded datasets - const detailedIds = getCurrentlyExpandedHistoryContentIds(); - if (detailedIds.length) { - params["details"] = detailedIds.join(","); - } - const url = `/api/histories/${historyId}/contents`; - lastRequestDate = new Date(); - const payload = await urlData({ url, params }); - // show warning that not all changes have been obtained - if (payload && payload.length == limit) { - console.debug(`Reached limit of monitored changes (limit=${limit}).`); - } - // pass changed items to attached stores - historyStore.setHistory(history); - datasetStore.saveDatasets(payload); - historyItemsStore.saveHistoryItems(historyId, payload); - collectionElementsStore.saveCollections(payload); - // trigger changes in legacy handler - if (app) { - app.user.loadFromApi(app.user.id || "current"); - } + if (!force && lastUpdateTime && lastUpdateTime >= history.update_time) { + return; + } + + const historyId = history.id; + lastUpdateTime = history.update_time; + historyItemsStore.setLastUpdateTime(); + const params = { + v: "dev", + limit: limit, + q: "update_time-ge", + qv: lastRequestDate.toISOString(), + }; + const detailedIds = getCurrentlyExpandedHistoryContentIds(); + if (detailedIds.length) { + params["details"] = detailedIds.join(","); + } + const url = `/api/histories/${historyId}/contents`; + lastRequestDate = new Date(); + const payload = await urlData({ url, 
params }); + if (payload && payload.length == limit) { + console.debug(`Reached limit of monitored changes (limit=${limit}).`); + } + historyStore.setHistory(history); + datasetStore.saveDatasets(payload); + historyItemsStore.saveHistoryItems(historyId, payload); + collectionElementsStore.saveCollections(payload); + if (app) { + app.user.loadFromApi(app.user.id || "current"); } } From 0e19f5fb64d0ed5d7cac4b521f2e2ad15e7eb9ba Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 16 Apr 2026 22:33:23 +0200 Subject: [PATCH 04/47] Split SSEEventDispatcher into its own module to drop inline imports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SSEEventDispatcher lived in galaxy.managers.sse alongside the pure connection types (SSEEvent, SSEConnectionManager). That forced two local imports inside _send() — send_control_task and all_control_queues_for_declare — because galaxy.queue_worker depends on SSEConnectionManager/SSEEvent from sse.py, so hoisting the queue_worker import to the top of sse.py would create a cycle. Move the dispatcher to galaxy.managers.sse_dispatch. It can then import from galaxy.queue_worker and galaxy.queues at module top: the new module is a leaf with respect to those dependencies (queue_worker never imports it back), so there's no cycle. Callers (app, notification manager, history audit monitor) import SSEEventDispatcher from its new home. Note: the conditional `import psycopg` / `import psycopg2` in history_audit_monitor.py and the feature-flagged HistoryAuditMonitor import in app.py are retained — both are legitimate optional/gated loads, not cycle workarounds. 
--- lib/galaxy/app/__init__.py | 6 +- lib/galaxy/managers/history_audit_monitor.py | 2 +- lib/galaxy/managers/notification.py | 2 +- lib/galaxy/managers/sse.py | 58 --------------- lib/galaxy/managers/sse_dispatch.py | 76 ++++++++++++++++++++ 5 files changed, 80 insertions(+), 64 deletions(-) create mode 100644 lib/galaxy/managers/sse_dispatch.py diff --git a/lib/galaxy/app/__init__.py b/lib/galaxy/app/__init__.py index 7c1a1e097243..1eac0e496d1b 100644 --- a/lib/galaxy/app/__init__.py +++ b/lib/galaxy/app/__init__.py @@ -77,10 +77,8 @@ from galaxy.managers.object_store_instances import UserObjectStoreResolverImpl from galaxy.managers.roles import RoleManager from galaxy.managers.session import GalaxySessionManager -from galaxy.managers.sse import ( - SSEConnectionManager, - SSEEventDispatcher, -) +from galaxy.managers.sse import SSEConnectionManager +from galaxy.managers.sse_dispatch import SSEEventDispatcher from galaxy.managers.tasks import ( AsyncTasksManager, CeleryAsyncTasksManager, diff --git a/lib/galaxy/managers/history_audit_monitor.py b/lib/galaxy/managers/history_audit_monitor.py index 4a9759bd8375..e8f3fadd08e3 100644 --- a/lib/galaxy/managers/history_audit_monitor.py +++ b/lib/galaxy/managers/history_audit_monitor.py @@ -29,7 +29,7 @@ from sqlalchemy.engine import Engine from galaxy.config import GalaxyAppConfiguration -from galaxy.managers.sse import SSEEventDispatcher +from galaxy.managers.sse_dispatch import SSEEventDispatcher from galaxy.model import ( History, HistoryAudit, diff --git a/lib/galaxy/managers/notification.py b/lib/galaxy/managers/notification.py index d82140549b52..081fad7b7aa9 100644 --- a/lib/galaxy/managers/notification.py +++ b/lib/galaxy/managers/notification.py @@ -37,7 +37,7 @@ ObjectNotFound, ) from galaxy.managers.markdown_util import to_html -from galaxy.managers.sse import SSEEventDispatcher +from galaxy.managers.sse_dispatch import SSEEventDispatcher from galaxy.model import ( GroupRoleAssociation, Notification, diff 
--git a/lib/galaxy/managers/sse.py b/lib/galaxy/managers/sse.py index ade467c3fe53..3315ff6a6268 100644 --- a/lib/galaxy/managers/sse.py +++ b/lib/galaxy/managers/sse.py @@ -9,7 +9,6 @@ import logging from collections import defaultdict from dataclasses import dataclass -from datetime import datetime from typing import ( AsyncIterator, Optional, @@ -19,8 +18,6 @@ if TYPE_CHECKING: from starlette.requests import Request - from galaxy.structured_app import MinimalManagerApp - log = logging.getLogger(__name__) @@ -171,58 +168,3 @@ async def stream( yield ": keepalive\n\n" finally: self.disconnect(user_id, queue) - - -class SSEEventDispatcher: - """Fans out SSE events across all Galaxy worker processes via the control queue. - - Thin wrapper around ``send_control_task`` so that managers can depend on a - narrow, injectable collaborator instead of importing the queue-worker - module directly. Works in both web-worker and Celery-worker contexts — - ``GalaxyManagerApplication`` sets up a publisher-only ``queue_worker`` for - the Celery side. - """ - - def __init__(self, app: "MinimalManagerApp") -> None: - self._app = app - - def _send(self, task: str, kwargs: dict) -> None: - if getattr(self._app, "queue_worker", None) is None: - # AMQP not configured at all (e.g. unit-test mock app). Skip silently. - log.debug("SSE dispatch skipped: no queue_worker configured (task=%s)", task) - return - from galaxy.queue_worker import send_control_task # circular: queue_worker -> app -> managers - from galaxy.queues import all_control_queues_for_declare - - # Only fan out to webapp processes — job handlers and workflow schedulers - # don't have browser SSE connections to push to. 
- declare_queues = all_control_queues_for_declare(self._app.application_stack, webapp_only=True) - send_control_task(self._app, task, kwargs=kwargs, expiration=10, declare_queues=declare_queues) - - def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str] = None) -> None: - self._send( - "notify_users", - { - "user_ids": user_ids, - "payload": payload, - "event_id": event_id or datetime.utcnow().isoformat(), - }, - ) - - def notify_broadcast(self, payload: str, event_id: Optional[str] = None) -> None: - self._send( - "notify_broadcast", - { - "payload": payload, - "event_id": event_id or datetime.utcnow().isoformat(), - }, - ) - - def history_update(self, user_updates: dict[str, list], event_id: Optional[str] = None) -> None: - self._send( - "history_update", - { - "user_updates": user_updates, - "event_id": event_id or datetime.utcnow().isoformat(), - }, - ) diff --git a/lib/galaxy/managers/sse_dispatch.py b/lib/galaxy/managers/sse_dispatch.py new file mode 100644 index 000000000000..35f6c690d37f --- /dev/null +++ b/lib/galaxy/managers/sse_dispatch.py @@ -0,0 +1,76 @@ +"""Producer-side SSE helper: fans events out across Galaxy processes. + +Kept in its own module (separate from ``galaxy.managers.sse``) because the +dispatcher depends on ``galaxy.queue_worker`` and ``galaxy.queues``, while +``queue_worker`` in turn depends on the connection types in ``sse``. Splitting +the producer (``SSEEventDispatcher``) from the connection state +(``SSEConnectionManager`` / ``SSEEvent``) breaks that cycle without requiring +inline imports in the hot path. 
+""" + +import logging +from datetime import datetime +from typing import ( + Optional, + TYPE_CHECKING, +) + +from galaxy.queue_worker import send_control_task +from galaxy.queues import all_control_queues_for_declare + +if TYPE_CHECKING: + from galaxy.structured_app import MinimalManagerApp + +log = logging.getLogger(__name__) + + +class SSEEventDispatcher: + """Fans out SSE events across all Galaxy worker processes via the control queue. + + Thin wrapper around ``send_control_task`` so managers can depend on a narrow, + injectable collaborator instead of reaching into the queue-worker module + directly. Works in both web-worker and Celery-worker contexts — + ``GalaxyManagerApplication`` sets up a publisher-only ``queue_worker`` for + the Celery side. + """ + + def __init__(self, app: "MinimalManagerApp") -> None: + self._app = app + + def _send(self, task: str, kwargs: dict) -> None: + if getattr(self._app, "queue_worker", None) is None: + # AMQP not configured at all (e.g. unit-test mock app). Skip silently. + log.debug("SSE dispatch skipped: no queue_worker configured (task=%s)", task) + return + # Only fan out to webapp processes — job handlers and workflow schedulers + # don't have browser SSE connections to push to. 
+ declare_queues = all_control_queues_for_declare(self._app.application_stack, webapp_only=True) + send_control_task(self._app, task, kwargs=kwargs, expiration=10, declare_queues=declare_queues) + + def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str] = None) -> None: + self._send( + "notify_users", + { + "user_ids": user_ids, + "payload": payload, + "event_id": event_id or datetime.utcnow().isoformat(), + }, + ) + + def notify_broadcast(self, payload: str, event_id: Optional[str] = None) -> None: + self._send( + "notify_broadcast", + { + "payload": payload, + "event_id": event_id or datetime.utcnow().isoformat(), + }, + ) + + def history_update(self, user_updates: dict[str, list], event_id: Optional[str] = None) -> None: + self._send( + "history_update", + { + "user_updates": user_updates, + "event_id": event_id or datetime.utcnow().isoformat(), + }, + ) From c364fef7014bcd222e8a5bb43dbdf95240d16ac9 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 14:44:58 +0200 Subject: [PATCH 05/47] Add pg_notify to history audit triggers for existing installs The SSE history-update pipeline (HistoryAuditMonitor LISTENing on galaxy_history_update) only fires on pg_notify. Fresh installs get the notify-emitting trigger functions from update_audit_table.py, but the last trigger-touching migration (c716ee82337b) re-created the functions without pg_notify, so upgraded databases silently never dispatched events. This revision replaces both audit trigger functions with notify-enabled versions; SQLite is a no-op. 
--- ...add_pg_notify_to_history_audit_triggers.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py diff --git a/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py b/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py new file mode 100644 index 000000000000..6e410a62e5b1 --- /dev/null +++ b/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py @@ -0,0 +1,104 @@ +"""Add pg_notify to history audit triggers + +Revision ID: b8d5e2f9a1c7 +Revises: f5e9e4bca542 +Create Date: 2026-04-17 14:30:00.000000 + +The SSE-based history update pipeline (see `managers/history_audit_monitor.py`) +depends on a PostgreSQL LISTEN on the `galaxy_history_update` channel. For +existing installations, trigger functions installed by earlier migrations (most +recently `c716ee82337b_replace_triggers`) do not emit the corresponding +`pg_notify`, so the monitor wakes up only from the poll-timeout fallback and +per-history events are never dispatched in real time. This revision replaces +both audit trigger functions with versions that emit `pg_notify` for each +affected history id, matching `model/triggers/update_audit_table.py` used on +fresh installs. + +SQLite installations use the poll-only path and require no change. 
+""" + +from alembic import op + +from galaxy.model.migrations.util import ( + _is_sqlite, + transaction, +) + +revision = "b8d5e2f9a1c7" +down_revision = "f5e9e4bca542" +branch_labels = None +depends_on = None + + +CHANNEL = "galaxy_history_update" + + +def upgrade(): + if _is_sqlite(): + return + with transaction(): + _install_functions(with_notify=True) + + +def downgrade(): + if _is_sqlite(): + return + with transaction(): + _install_functions(with_notify=False) + + +def _install_functions(with_notify: bool): + version_info = op.get_bind().engine.dialect.server_version_info + use_statement = version_info is None or version_info[0] >= 10 + builder = _statement_trigger_fn if use_statement else _row_trigger_fn + for id_field in ("history_id", "id"): + op.execute(builder(f"fn_audit_history_by_{id_field}", id_field, with_notify)) + + +def _statement_trigger_fn(function_name: str, id_field: str, with_notify: bool) -> str: + notify_block = ( + f""" + FOR _history_id IN SELECT DISTINCT {id_field} FROM new_table WHERE {id_field} IS NOT NULL + LOOP + PERFORM pg_notify('{CHANNEL}', _history_id::text); + END LOOP; + """ + if with_notify + else "" + ) + declare_block = "DECLARE _history_id integer;" if with_notify else "" + return f""" + CREATE OR REPLACE FUNCTION {function_name}() + RETURNS TRIGGER + LANGUAGE 'plpgsql' + AS $BODY$ + {declare_block} + BEGIN + INSERT INTO history_audit (history_id, update_time) + SELECT DISTINCT {id_field}, clock_timestamp() AT TIME ZONE 'UTC' + FROM new_table + WHERE {id_field} IS NOT NULL + ON CONFLICT DO NOTHING; + {notify_block} + RETURN NULL; + END; + $BODY$ + """ + + +def _row_trigger_fn(function_name: str, id_field: str, with_notify: bool) -> str: + notify_stmt = f"PERFORM pg_notify('{CHANNEL}', NEW.{id_field}::text);" if with_notify else "" + return f""" + CREATE OR REPLACE FUNCTION {function_name}() + RETURNS TRIGGER + LANGUAGE 'plpgsql' + AS $BODY$ + BEGIN + INSERT INTO history_audit (history_id, update_time) + VALUES 
(NEW.{id_field}, clock_timestamp() AT TIME ZONE 'UTC') + ON CONFLICT DO NOTHING; + {notify_stmt} + RETURN NULL; + END; + $BODY$ + """ From 5d1a1c7a52da7c4d9a087a2b4563231589c6442d Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 14:45:15 +0200 Subject: [PATCH 06/47] Drive history/notifications SSE from config, not runtime socket state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fallback pattern watched EventSource `connected` to switch between SSE and polling. That conflates two unrelated things: network blips (which EventSource auto-recovers from) and deployment config (which decides whether events ever arrive at all). In particular, `/api/events/stream` accepts connections even when HistoryAuditMonitor is disabled, so the watch-based design silently stopped polling and waited forever for events that never came. Gate the choice on the server config flag instead: - Expose `enable_sse_history_updates` via ConfigSerializer so the client can read it alongside `enable_notification_system`. - In historyStore/notificationsStore, read the flag once after config loads: if true, prime an initial fetch and connect SSE; if false, start the resource watcher for polling. No runtime toggle. - Instantiate `useResourceWatcher` lazily inside the polling branch so its `visibilitychange` listener — which would otherwise restart polling on every tab focus — is never registered in SSE mode. - Drop the 5-error permanent-give-up in `useSSE`; with no runtime fallback it would just freeze the client. Rely on EventSource's native Last-Event-ID reconnect. Add Vitest coverage for both stores covering SSE-on and SSE-off scenarios, including a tab-visibility toggle that must not re-arm polling in SSE mode. 
--- client/src/composables/useNotificationSSE.ts | 14 +- client/src/stores/historyStore.test.ts | 179 +++++++++++++++++ client/src/stores/historyStore.ts | 92 ++++++--- client/src/stores/notificationsStore.test.ts | 196 +++++++++++++++++++ client/src/stores/notificationsStore.ts | 103 ++++++---- lib/galaxy/managers/configuration.py | 1 + 6 files changed, 501 insertions(+), 84 deletions(-) create mode 100644 client/src/stores/historyStore.test.ts create mode 100644 client/src/stores/notificationsStore.test.ts diff --git a/client/src/composables/useNotificationSSE.ts b/client/src/composables/useNotificationSSE.ts index 84b03373cfa7..088d3d1bf054 100644 --- a/client/src/composables/useNotificationSSE.ts +++ b/client/src/composables/useNotificationSSE.ts @@ -26,11 +26,9 @@ export type SSEEventType = (typeof SSE_EVENT_TYPES)[number]; export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: readonly SSEEventType[] = SSE_EVENT_TYPES) { const connected = ref(false); let eventSource: EventSource | null = null; - let consecutiveErrors = 0; function connect() { disconnect(); - consecutiveErrors = 0; const url = withPrefix("/api/events/stream"); eventSource = new EventSource(url); @@ -40,22 +38,18 @@ export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: reado eventSource.onopen = () => { connected.value = true; - consecutiveErrors = 0; // Expose a global readiness flag so Selenium tests can distinguish // a working SSE pipeline from the polling fallback. (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = true; }; eventSource.onerror = () => { + // EventSource auto-reconnects natively; SSE-vs-polling is a + // config-level decision (see historyStore / notificationsStore), + // so we must not give up on transient errors here — doing so + // would leave the client with no updates at all. 
connected.value = false; (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = false; - consecutiveErrors++; - // EventSource auto-reconnects, but if we get too many errors - // in a row, the server likely doesn't support SSE — give up - // and let the caller fall back to polling. - if (consecutiveErrors > 5) { - disconnect(); - } }; } diff --git a/client/src/stores/historyStore.test.ts b/client/src/stores/historyStore.test.ts new file mode 100644 index 000000000000..2cf0cecbe24a --- /dev/null +++ b/client/src/stores/historyStore.test.ts @@ -0,0 +1,179 @@ +import flushPromises from "flush-promises"; +import { createPinia, setActivePinia } from "pinia"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { ref } from "vue"; + +import { useServerMock } from "@/api/client/__mocks__"; + +import { useHistoryStore } from "./historyStore"; + +// Capture SSE composable usage — neither test should rely on the real +// EventSource. The `connected` ref stays false by default; the store must +// NOT key polling behavior off it. +const mockSseConnect = vi.fn(); +const mockSseDisconnect = vi.fn(); +const mockSseConnected = ref(false); + +vi.mock("@/composables/useNotificationSSE", () => ({ + useSSE: vi.fn(() => ({ + connect: mockSseConnect, + disconnect: mockSseDisconnect, + connected: mockSseConnected, + })), +})); + +// `watchHistory(app)` is the polling handler invoked on the short/long +// interval. We mock it so each invocation is observable without pulling in +// the history-items store, dataset store, and Galaxy app instance. 
+const mockWatchHistory = vi.fn().mockResolvedValue(undefined); +const mockRefreshHistoryFromPush = vi.fn().mockResolvedValue(undefined); +vi.mock("@/watch/watchHistory", () => ({ + ACTIVE_POLLING_INTERVAL: 3000, + INACTIVE_POLLING_INTERVAL: 60_000, + watchHistory: (app: unknown) => mockWatchHistory(app), + refreshHistoryFromPush: (app: unknown) => mockRefreshHistoryFromPush(app), +})); + +vi.mock("@/app", () => ({ + getGalaxyInstance: () => ({ name: "fake-galaxy" }), +})); + +const { server, http } = useServerMock(); + +function configResponse(enableSse: boolean) { + return { enable_sse_history_updates: enableSse }; +} + +function registerDefaultHandlers({ enableSse }: { enableSse: boolean }) { + server.use( + http.get("/api/configuration", ({ response }) => { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return response(200).json(configResponse(enableSse) as any); + }), + ); +} + +describe("historyStore — config-driven SSE vs polling", () => { + beforeEach(() => { + setActivePinia(createPinia()); + mockSseConnect.mockClear(); + mockSseDisconnect.mockClear(); + mockSseConnected.value = false; + mockWatchHistory.mockClear(); + mockRefreshHistoryFromPush.mockClear(); + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + describe("when enable_sse_history_updates is true (SSE scenario)", () => { + beforeEach(() => { + registerDefaultHandlers({ enableSse: true }); + }); + + it("primes the store with one initial load, connects SSE, and does not keep polling", async () => { + const store = useHistoryStore(); + + // `startWatchingHistory` is an exported alias for + // `startWatchingHistoryWithSSE` (see historyStore.ts). + store.startWatchingHistory(); + + // Config loads async; let the watch fire. 
+ await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + + expect(mockSseConnect).toHaveBeenCalledTimes(1); + // One-shot initial fetch so the history panel isn't empty before + // the first SSE event arrives. + expect(mockWatchHistory).toHaveBeenCalledTimes(1); + + // Advance past the short polling interval (3s) several times and + // confirm the polling handler is not invoked a second time in SSE mode. + vi.advanceTimersByTime(30_000); + await flushPromises(); + expect(mockWatchHistory).toHaveBeenCalledTimes(1); + }); + + it("does not start polling when the tab regains visibility", async () => { + const store = useHistoryStore(); + store.startWatchingHistory(); + await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + expect(mockWatchHistory).toHaveBeenCalledTimes(1); + + // Simulate a tab hide/show cycle. `useResourceWatcher` registers + // a `visibilitychange` listener whose handler calls + // `startWatchingResourceIfNeeded` — in SSE mode that would + // silently resume polling. Because we never instantiated the + // watcher, no listener should exist and no poll should fire. 
+ Object.defineProperty(document, "visibilityState", { + configurable: true, + get: () => "hidden", + }); + document.dispatchEvent(new Event("visibilitychange")); + Object.defineProperty(document, "visibilityState", { + configurable: true, + get: () => "visible", + }); + document.dispatchEvent(new Event("visibilitychange")); + + await flushPromises(); + vi.advanceTimersByTime(30_000); + await flushPromises(); + expect(mockWatchHistory).toHaveBeenCalledTimes(1); + }); + }); + + describe("when enable_sse_history_updates is false (polling scenario)", () => { + beforeEach(() => { + registerDefaultHandlers({ enableSse: false }); + }); + + it("does not connect SSE and polls on the configured interval", async () => { + const store = useHistoryStore(); + + store.startWatchingHistory(); + + // Let the config load and the initial watch fire. + await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + + expect(mockSseConnect).not.toHaveBeenCalled(); + + // The resource watcher invokes the handler immediately on start + // and then re-schedules after each completion. Advance past the + // short interval and confirm repeated invocations. 
+ const initialCalls = mockWatchHistory.mock.calls.length; + expect(initialCalls).toBeGreaterThanOrEqual(1); + + await vi.advanceTimersByTimeAsync(3000); + await flushPromises(); + expect(mockWatchHistory.mock.calls.length).toBeGreaterThan(initialCalls); + }); + + it("calling startWatchingHistory again is idempotent (no second SSE, polling already running)", async () => { + const store = useHistoryStore(); + + store.startWatchingHistory(); + await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + + const pollsAfterFirst = mockWatchHistory.mock.calls.length; + + store.startWatchingHistory(); + await flushPromises(); + + expect(mockSseConnect).not.toHaveBeenCalled(); + // Calling again does not schedule an additional independent + // polling loop — the handler should not have been fired an + // extra time by the second call alone. + expect(mockWatchHistory.mock.calls.length).toBe(pollsAfterFirst); + }); + }); +}); diff --git a/client/src/stores/historyStore.ts b/client/src/stores/historyStore.ts index 91d8462f6e98..ec2d91a0728b 100644 --- a/client/src/stores/historyStore.ts +++ b/client/src/stores/historyStore.ts @@ -17,6 +17,7 @@ import { HistoryFilters } from "@/components/History/HistoryFilters"; import { useResourceWatcher } from "@/composables/resourceWatcher"; import { useSSE } from "@/composables/useNotificationSSE"; import { useUserLocalStorage } from "@/composables/userLocalStorage"; +import { useConfigStore } from "@/stores/configurationStore"; import { createAndSelectNewHistory, getCurrentHistoryFromServer, @@ -396,7 +397,7 @@ export const useHistoryStore = defineStore("historyStore", () => { // SSE-driven history updates: when we receive a history_update event, // immediately trigger a refresh of the current history const SSE_HISTORY_EVENT_TYPES = ["history_update"] as const; - const { connect: sseHistoryConnect, connected: sseHistoryConnected } = useSSE( + const { connect: sseHistoryConnect, disconnect: 
sseHistoryDisconnect } = useSSE( handleHistorySSEEvent, SSE_HISTORY_EVENT_TYPES, ); @@ -418,41 +419,70 @@ export const useHistoryStore = defineStore("historyStore", () => { console.error("Error handling history SSE event:", e); } } - const { - startWatchingResource: startWatchingHistory, - stopWatchingResource: stopWatchingHistory, - isWatchingResource: isWatchingHistory, - } = useResourceWatcher(watchHistory, { - shortPollingInterval: ACTIVE_POLLING_INTERVAL, - longPollingInterval: INACTIVE_POLLING_INTERVAL, - }); - // When the SSE pipeline is live it delivers history_update events directly - // and we stop the 3-second poll. If SSE drops (server unsupported, repeated - // errors, network blip) the watch below resumes polling as the fallback. + // Choose between SSE and polling based on the server config flag + // `enable_sse_history_updates`. SSE success at the socket level is not a + // reliable proxy: the `/api/events/stream` endpoint accepts connections + // even when the HistoryAuditMonitor is disabled, so relying on the + // EventSource `connected` state would silently stop polling without any + // events ever arriving. // - // The public `startWatchingHistory` alias is called from many places (component - // mounts, upload/tool/workflow completion callbacks). We must NOT reconnect SSE - // on every call — that would flap `connected` false→true repeatedly, and - // `onopen` may never stably fire. Initialize SSE exactly once. - let sseInitialized = false; + // `useResourceWatcher` is instantiated lazily because it registers a + // `visibilitychange` listener that calls `startWatchingResourceIfNeeded` + // every time the tab regains focus — in SSE mode that would re-start + // polling we explicitly don't want. 
+ const isWatchingHistory = ref(false); + let watchingInitialized = false; + let stopWatchingHistoryResource: (() => void) | null = null; function startWatchingHistoryWithSSE() { - if (sseInitialized) { + if (watchingInitialized) { return; } - sseInitialized = true; - sseHistoryConnect(); - watch( - sseHistoryConnected, - (isConnected) => { - if (isConnected) { - stopWatchingHistory(); - } else { - startWatchingHistory(); - } - }, - { immediate: true }, - ); + watchingInitialized = true; + + const configStore = useConfigStore(); + const decide = () => { + if (configStore.config?.enable_sse_history_updates) { + // SSE delivers incremental updates only; the store still needs + // a baseline fetch so the history panel isn't empty until the + // first change arrives. + watchHistory().catch((err) => console.warn("Initial history load failed", err)); + sseHistoryConnect(); + } else { + // The resource watcher fires its handler once immediately and + // then re-schedules on the polling interval, which covers the + // initial load as well as ongoing updates. 
+ const { startWatchingResource, stopWatchingResource, isWatchingResource } = useResourceWatcher( + watchHistory, + { + shortPollingInterval: ACTIVE_POLLING_INTERVAL, + longPollingInterval: INACTIVE_POLLING_INTERVAL, + }, + ); + stopWatchingHistoryResource = stopWatchingResource; + watch(isWatchingResource, (v) => (isWatchingHistory.value = v), { immediate: true }); + startWatchingResource(); + } + }; + + if (configStore.isLoaded) { + decide(); + } else { + const stop = watch( + () => configStore.isLoaded, + (loaded) => { + if (loaded) { + stop(); + decide(); + } + }, + ); + } + } + + function stopWatchingHistory() { + sseHistoryDisconnect(); + stopWatchingHistoryResource?.(); } async function loadHistoryById(historyId: string) { diff --git a/client/src/stores/notificationsStore.test.ts b/client/src/stores/notificationsStore.test.ts new file mode 100644 index 000000000000..4b7d8545cdab --- /dev/null +++ b/client/src/stores/notificationsStore.test.ts @@ -0,0 +1,196 @@ +import flushPromises from "flush-promises"; +import { createPinia, setActivePinia } from "pinia"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { ref } from "vue"; + +import { useServerMock } from "@/api/client/__mocks__"; +import type { UserNotification } from "@/api/notifications"; + +import { useNotificationsStore } from "./notificationsStore"; + +// Capture SSE composable usage without opening a real EventSource. +// The returned `connected` ref stays false by default — this is intentional +// because the store must NOT be relying on it; the decision is config-driven. +const mockSseConnect = vi.fn(); +const mockSseDisconnect = vi.fn(); +const mockSseConnected = ref(false); + +vi.mock("@/composables/useNotificationSSE", () => ({ + useSSE: vi.fn(() => ({ + connect: mockSseConnect, + disconnect: mockSseDisconnect, + connected: mockSseConnected, + })), +})); + +// Realistic fixture: a single unread notification, as returned by +// GET /api/notifications. 
Shape mirrors UserNotification.
+function makeNotificationFixture(overrides: Partial<UserNotification> = {}): UserNotification {
+    return {
+        id: "notif-1",
+        source: "galaxy_test",
+        category: "message",
+        variant: "info",
+        create_time: "2026-01-01T00:00:00",
+        update_time: "2026-01-01T00:00:00",
+        publication_time: "2026-01-01T00:00:00",
+        expiration_time: null,
+        seen_time: null,
+        deleted: false,
+        content: { subject: "hello", message: "welcome" },
+        ...overrides,
+    } as UserNotification;
+}
+
+const SCENARIO_NOTIFICATION = makeNotificationFixture();
+const SCENARIO_STATUS_SINCE = {
+    total_unread_count: 1,
+    notifications: [SCENARIO_NOTIFICATION],
+    broadcasts: [],
+};
+
+const { server, http } = useServerMock();
+
+function configResponse(overrides: Record<string, unknown>) {
+    // The /api/configuration response carries many fields; the store only
+    // reads enable_notification_system, so a minimal object suffices.
+    return { enable_notification_system: false, ...overrides };
+}
+
+function registerDefaultHandlers({ enableNotificationSystem }: { enableNotificationSystem: boolean }) {
+    server.use(
+        http.get("/api/configuration", ({ response }) => {
+            // eslint-disable-next-line @typescript-eslint/no-explicit-any
+            return response(200).json(configResponse({ enable_notification_system: enableNotificationSystem }) as any);
+        }),
+        http.get("/api/notifications", ({ response }) => {
+            return response(200).json([SCENARIO_NOTIFICATION]);
+        }),
+        http.get("/api/notifications/broadcast", ({ response }) => {
+            return response(200).json([]);
+        }),
+        http.get("/api/notifications/status", ({ response }) => {
+            // eslint-disable-next-line @typescript-eslint/no-explicit-any
+            return response(200).json(SCENARIO_STATUS_SINCE as any);
+        }),
+    );
+}
+
+describe("notificationsStore — config-driven SSE vs polling", () => {
+    beforeEach(() => {
+        setActivePinia(createPinia());
+        mockSseConnect.mockClear();
+        mockSseDisconnect.mockClear();
+        mockSseConnected.value = false;
+        vi.useFakeTimers();
+    });
+
+    
afterEach(() => { + vi.useRealTimers(); + }); + + describe("when enable_notification_system is true (SSE scenario)", () => { + beforeEach(() => { + registerDefaultHandlers({ enableNotificationSystem: true }); + }); + + it("connects SSE and does not poll the status endpoint", async () => { + const store = useNotificationsStore(); + + // The store fires an initial load (GET /api/notifications + broadcasts) + // and then decides SSE vs polling based on the config flag. + const statusSpy = vi.fn(); + server.use( + http.get("/api/notifications/status", ({ response }) => { + statusSpy(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return response(200).json(SCENARIO_STATUS_SINCE as any); + }), + ); + + await vi.runOnlyPendingTimersAsync(); + await store.startWatchingNotifications(); + await flushPromises(); + + // The config load is async — let the watch fire. + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + + expect(mockSseConnect).toHaveBeenCalledTimes(1); + + // Advance well past the polling interval (30s) and confirm + // the status endpoint is never polled while SSE is the active channel. + vi.advanceTimersByTime(120_000); + await flushPromises(); + expect(statusSpy).not.toHaveBeenCalled(); + }); + + it("does not start polling when the tab regains visibility", async () => { + const store = useNotificationsStore(); + const statusSpy = vi.fn(); + server.use( + http.get("/api/notifications/status", ({ response }) => { + statusSpy(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return response(200).json(SCENARIO_STATUS_SINCE as any); + }), + ); + + await vi.runOnlyPendingTimersAsync(); + await store.startWatchingNotifications(); + await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + + // Tab hide/show cycle — must not trigger the status endpoint. 
+ Object.defineProperty(document, "visibilityState", { + configurable: true, + get: () => "hidden", + }); + document.dispatchEvent(new Event("visibilitychange")); + Object.defineProperty(document, "visibilityState", { + configurable: true, + get: () => "visible", + }); + document.dispatchEvent(new Event("visibilitychange")); + + await flushPromises(); + vi.advanceTimersByTime(120_000); + await flushPromises(); + expect(statusSpy).not.toHaveBeenCalled(); + }); + }); + + describe("when enable_notification_system is false (polling scenario)", () => { + beforeEach(() => { + registerDefaultHandlers({ enableNotificationSystem: false }); + }); + + it("does not connect SSE and polls the status endpoint on the configured interval", async () => { + const store = useNotificationsStore(); + + const statusSpy = vi.fn(); + server.use( + http.get("/api/notifications/status", ({ response }) => { + statusSpy(); + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return response(200).json(SCENARIO_STATUS_SINCE as any); + }), + ); + + await vi.runOnlyPendingTimersAsync(); + await store.startWatchingNotifications(); + await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); + + expect(mockSseConnect).not.toHaveBeenCalled(); + + // Advance past the short polling interval (30s) and confirm + // the status endpoint is hit by the resource watcher. 
+ vi.advanceTimersByTime(30_000); + await flushPromises(); + expect(statusSpy).toHaveBeenCalled(); + }); + }); +}); diff --git a/client/src/stores/notificationsStore.ts b/client/src/stores/notificationsStore.ts index 220bd8e796ac..fae40d7f0e02 100644 --- a/client/src/stores/notificationsStore.ts +++ b/client/src/stores/notificationsStore.ts @@ -5,6 +5,7 @@ import { GalaxyApi } from "@/api"; import type { NotificationChanges, UserNotification, UserNotificationsBatchUpdateRequest } from "@/api/notifications"; import { useResourceWatcher } from "@/composables/resourceWatcher"; import { useSSE } from "@/composables/useNotificationSSE"; +import { useConfigStore } from "@/stores/configurationStore"; import { rethrowSimple } from "@/utils/simple-error"; import { mergeObjectListsById } from "@/utils/utils"; @@ -21,47 +22,12 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { const loadingNotifications = ref(false); const lastNotificationUpdate = ref(null); - const wantSSE = ref(true); const unreadNotifications = computed(() => notifications.value.filter((n) => !n.seen_time)); // --- SSE setup (listen only for notification event types) --- const NOTIFICATION_EVENT_TYPES = ["notification_update", "broadcast_update", "notification_status"] as const; - const { - connect: sseConnect, - disconnect: sseDisconnect, - connected: sseConnected, - } = useSSE(handleSSEEvent, NOTIFICATION_EVENT_TYPES); - - // --- Polling fallback --- - const { startWatchingResource: startPolling, stopWatchingResource: stopPolling } = useResourceWatcher( - getNotificationStatus, - { - shortPollingInterval: ACTIVE_POLLING_INTERVAL, - longPollingInterval: INACTIVE_POLLING_INTERVAL, - }, - ); - - function stopWatchingNotifications() { - sseDisconnect(); - stopPolling(); - } - - // When SSE connection drops and doesn't recover, fall back to polling - watch(sseConnected, (isConnected) => { - if (!isConnected && wantSSE.value) { - // SSE disconnected but we still want updates — 
don't start polling - // immediately, EventSource will auto-reconnect. Only if useSSE is - // set to false (after too many errors) do we fall back. - } - }); - - watch(wantSSE, (wantSSE) => { - if (!wantSSE) { - sseDisconnect(); - startPolling(); - } - }); + const { connect: sseConnect, disconnect: sseDisconnect } = useSSE(handleSSEEvent, NOTIFICATION_EVENT_TYPES); function handleSSEEvent(event: MessageEvent) { try { @@ -145,6 +111,55 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { } } + // Choose between SSE and polling based on the server config flag + // `enable_notification_system`. The `/api/events/stream` endpoint accepts + // connections regardless of the flag, so we cannot rely on EventSource + // connectivity to decide — config is the source of truth. + // + // `useResourceWatcher` is instantiated lazily because it registers a + // `visibilitychange` listener that calls `startWatchingResourceIfNeeded` + // every time the tab regains focus — in SSE mode that would re-start + // polling we explicitly don't want. 
+ let watchingInitialized = false; + let stopPolling: (() => void) | null = null; + function ensureWatchingWithConfig() { + if (watchingInitialized) { + return; + } + watchingInitialized = true; + + const configStore = useConfigStore(); + const decide = () => { + if (configStore.config?.enable_notification_system) { + sseConnect(); + } else { + const { startWatchingResource: startPollingResource, stopWatchingResource } = useResourceWatcher( + getNotificationStatus, + { + shortPollingInterval: ACTIVE_POLLING_INTERVAL, + longPollingInterval: INACTIVE_POLLING_INTERVAL, + }, + ); + stopPolling = stopWatchingResource; + startPollingResource(); + } + }; + + if (configStore.isLoaded) { + decide(); + } else { + const stop = watch( + () => configStore.isLoaded, + (loaded) => { + if (loaded) { + stop(); + decide(); + } + }, + ); + } + } + async function startWatchingNotifications() { // Always do an initial load first if (!lastNotificationUpdate.value) { @@ -161,11 +176,7 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { } } - if (wantSSE.value) { - sseConnect(); - } else { - startPolling(); - } + ensureWatchingWithConfig(); } async function updateBatchNotification(request: UserNotificationsBatchUpdateRequest) { @@ -180,8 +191,9 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { if (request.changes.deleted) { notifications.value = notifications.value.filter((n) => !request.notification_ids.includes(n.id)); } - // If not using SSE, trigger a poll to refresh state - if (!sseConnected.value) { + // If the notification system (and therefore SSE) is disabled, trigger + // a poll to refresh state after a local mutation. 
+ if (!useConfigStore().config?.enable_notification_system) { startWatchingNotifications(); } } @@ -194,6 +206,11 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { totalUnreadCount.value = notifications.value.filter((n) => !n.seen_time).length; } + function stopWatchingNotifications() { + sseDisconnect(); + stopPolling?.(); + } + return { notifications, totalUnreadCount, diff --git a/lib/galaxy/managers/configuration.py b/lib/galaxy/managers/configuration.py index bf49cce94d59..9d7647e2fb54 100644 --- a/lib/galaxy/managers/configuration.py +++ b/lib/galaxy/managers/configuration.py @@ -229,6 +229,7 @@ def _config_is_truthy(item, key, **context): "tool_training_recommendations_link": _use_config, "tool_training_recommendations_api_url": _use_config, "enable_notification_system": _use_config, + "enable_sse_history_updates": _use_config, "instance_resource_url": _use_config, "instance_access_url": _use_config, "organization_name": _use_config, From 973dba5c83b57a69d08ae7896b5b4f286dcccde7 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 14:45:23 +0200 Subject: [PATCH 07/47] Skip response buffering for non-HTML in galaxy-dev-server plugin The Vite dev-server plugin overrode `res.write`/`res.end` on every response to collect the body and rewrite `bundled.js` script tags in HTML on end(). For `text/event-stream` the response never ends, so SSE frames sat in the buffer indefinitely and clients reached through `http://localhost:5173/` saw no events. Decide on first write: buffer only when Content-Type is text/html (the only case the transform applies to); pass write/end straight through for JSON, binary, and streaming responses. 
--- client/vite-plugin-galaxy-dev-server.js | 71 +++++++++++++++---------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/client/vite-plugin-galaxy-dev-server.js b/client/vite-plugin-galaxy-dev-server.js index b46e13defa64..e2ca53831713 100644 --- a/client/vite-plugin-galaxy-dev-server.js +++ b/client/vite-plugin-galaxy-dev-server.js @@ -99,16 +99,33 @@ export function galaxyDevServerPlugin() { const originalWrite = res.write.bind(res); const originalEnd = res.end.bind(res); - // Buffer to collect response body + // Buffer to collect response body. We only buffer when the + // upstream response is HTML; everything else (JSON, binary, + // and crucially `text/event-stream`) must pass straight + // through, because streaming responses never call + // `res.end()` and would otherwise stall indefinitely. const chunks = []; - let isHtml = false; + // Tri-state: null = undecided (first write hasn't landed yet), + // true = stream it through untransformed, + // false = buffer for HTML rewrite on end(). + let passthrough = null; + + function decidePassthrough() { + if (passthrough !== null) { + return; + } + const contentType = res.getHeader("content-type"); + passthrough = !contentType || !contentType.toString().includes("text/html"); + } - // Override write to collect chunks res.write = function (chunk, encoding, callback) { + decidePassthrough(); + if (passthrough) { + return originalWrite(chunk, encoding, callback); + } if (chunk) { chunks.push(Buffer.isBuffer(chunk) ? 
chunk : Buffer.from(chunk, encoding)); } - // Don't write yet - we'll write in end() if (typeof encoding === "function") { encoding(); // encoding is actually the callback } else if (typeof callback === "function") { @@ -119,41 +136,39 @@ export function galaxyDevServerPlugin() { // Override end to transform and send response res.end = function (chunk, encoding, callback) { + decidePassthrough(); + if (passthrough) { + return originalEnd(chunk, encoding, callback); + } + if (chunk) { chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk, encoding)); } - // Check content type - const contentType = res.getHeader("content-type"); - isHtml = contentType && contentType.toString().includes("text/html"); - // Combine all chunks let body = Buffer.concat(chunks); - // Transform HTML responses that contain Galaxy bundles - if (isHtml) { - // Decompress gzip responses (common from remote Galaxy servers) - const contentEncoding = res.getHeader("content-encoding"); - if (contentEncoding === "gzip") { - try { - body = gunzipSync(body); - } catch (e) { - // If decompression fails, continue with original body - console.warn("[galaxy-dev-server] Failed to decompress gzip response:", e.message); - } + // Decompress gzip responses (common from remote Galaxy servers) + const contentEncoding = res.getHeader("content-encoding"); + if (contentEncoding === "gzip") { + try { + body = gunzipSync(body); + } catch (e) { + // If decompression fails, continue with original body + console.warn("[galaxy-dev-server] Failed to decompress gzip response:", e.message); } + } - let htmlString = body.toString("utf-8"); - if (htmlString.includes("bundled.js") || htmlString.includes("/static/dist/")) { - htmlString = transformGalaxyHtml(htmlString); - body = Buffer.from(htmlString, "utf-8"); + let htmlString = body.toString("utf-8"); + if (htmlString.includes("bundled.js") || htmlString.includes("/static/dist/")) { + htmlString = transformGalaxyHtml(htmlString); + body = Buffer.from(htmlString, 
"utf-8"); - // Update content-length header - res.setHeader("content-length", body.length); + // Update content-length header + res.setHeader("content-length", body.length); - // Remove content-encoding since we've decompressed it - res.removeHeader("content-encoding"); - } + // Remove content-encoding since we've decompressed it + res.removeHeader("content-encoding"); } // Send the response From 3cf8a510185adf1992db30edf9e81042e19cf2ec Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 15:18:29 +0200 Subject: [PATCH 08/47] Align history-audit trigger SQL between migration and runtime installer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The follow-up pg_notify migration b8d5e2f9a1c7 and runtime installer update_audit_table.py carried two independent copies of the plpgsql trigger function body, and disagreed on the STATEMENT-vs-ROW threshold: - c716ee82337b (last trigger migration) used `version > 10`, so on PG 10 it installed a ROW trigger + ROW-variant function. - b8d5e2f9a1c7 used `>= 10`, so on PG 10 its upgrade() overwrote the function body with the STATEMENT variant (references `new_table`) while the trigger left behind by c716ee82337b was still ROW — the next audit write would raise "missing FROM-clause entry for new_table". - update_audit_table.py (fresh install) used `>= 10`, which is internally consistent for fresh installs but diverges from the upgrade path. Extract `build_trigger_fn(function_name, id_field, use_statement, with_notify)` and `use_statement_trigger(version)` from update_audit_table.py, and have the migration import them. Now both paths share one SQL body and one predicate, and the predicate matches c716ee82337b so upgraded PG 10 databases keep a consistent ROW trigger + ROW-variant function. (PG 10 is long past EOL, but drift between runtime install and migration is exactly the class of bug this migration was written to repair.) 
--- ...add_pg_notify_to_history_audit_triggers.py | 70 ++-------- .../model/triggers/update_audit_table.py | 120 ++++++++++-------- 2 files changed, 83 insertions(+), 107 deletions(-) diff --git a/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py b/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py index 6e410a62e5b1..0fe49572c790 100644 --- a/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py +++ b/lib/galaxy/model/migrations/alembic/versions_gxy/b8d5e2f9a1c7_add_pg_notify_to_history_audit_triggers.py @@ -14,6 +14,10 @@ affected history id, matching `model/triggers/update_audit_table.py` used on fresh installs. +The STATEMENT-vs-ROW decision must match the trigger DEFINITION installed by +`c716ee82337b` so the function body references the right context (`new_table` +vs `NEW`); both use `version > 10` (and treat offline mode as STATEMENT). + SQLite installations use the poll-only path and require no change. """ @@ -23,6 +27,11 @@ _is_sqlite, transaction, ) +from galaxy.model.triggers.update_audit_table import ( + build_trigger_fn, + fn_prefix, + use_statement_trigger, +) revision = "b8d5e2f9a1c7" down_revision = "f5e9e4bca542" @@ -30,9 +39,6 @@ depends_on = None -CHANNEL = "galaxy_history_update" - - def upgrade(): if _is_sqlite(): return @@ -47,58 +53,10 @@ def downgrade(): _install_functions(with_notify=False) -def _install_functions(with_notify: bool): +def _install_functions(with_notify: bool) -> None: version_info = op.get_bind().engine.dialect.server_version_info - use_statement = version_info is None or version_info[0] >= 10 - builder = _statement_trigger_fn if use_statement else _row_trigger_fn + # Offline mode (no live connection) matches c716ee82337b: assume STATEMENT. 
+ statement = version_info is None or use_statement_trigger(version_info[0]) for id_field in ("history_id", "id"): - op.execute(builder(f"fn_audit_history_by_{id_field}", id_field, with_notify)) - - -def _statement_trigger_fn(function_name: str, id_field: str, with_notify: bool) -> str: - notify_block = ( - f""" - FOR _history_id IN SELECT DISTINCT {id_field} FROM new_table WHERE {id_field} IS NOT NULL - LOOP - PERFORM pg_notify('{CHANNEL}', _history_id::text); - END LOOP; - """ - if with_notify - else "" - ) - declare_block = "DECLARE _history_id integer;" if with_notify else "" - return f""" - CREATE OR REPLACE FUNCTION {function_name}() - RETURNS TRIGGER - LANGUAGE 'plpgsql' - AS $BODY$ - {declare_block} - BEGIN - INSERT INTO history_audit (history_id, update_time) - SELECT DISTINCT {id_field}, clock_timestamp() AT TIME ZONE 'UTC' - FROM new_table - WHERE {id_field} IS NOT NULL - ON CONFLICT DO NOTHING; - {notify_block} - RETURN NULL; - END; - $BODY$ - """ - - -def _row_trigger_fn(function_name: str, id_field: str, with_notify: bool) -> str: - notify_stmt = f"PERFORM pg_notify('{CHANNEL}', NEW.{id_field}::text);" if with_notify else "" - return f""" - CREATE OR REPLACE FUNCTION {function_name}() - RETURNS TRIGGER - LANGUAGE 'plpgsql' - AS $BODY$ - BEGIN - INSERT INTO history_audit (history_id, update_time) - VALUES (NEW.{id_field}, clock_timestamp() AT TIME ZONE 'UTC') - ON CONFLICT DO NOTHING; - {notify_stmt} - RETURN NULL; - END; - $BODY$ - """ + fn_name = f"{fn_prefix}_{id_field}" + op.execute(build_trigger_fn(fn_name, id_field, use_statement=statement, with_notify=with_notify)) diff --git a/lib/galaxy/model/triggers/update_audit_table.py b/lib/galaxy/model/triggers/update_audit_table.py index 9ce71e132a71..be35d85da816 100644 --- a/lib/galaxy/model/triggers/update_audit_table.py +++ b/lib/galaxy/model/triggers/update_audit_table.py @@ -3,6 +3,9 @@ # function name prefix fn_prefix = "fn_audit_history_by" +# channel used by pg_notify so HistoryAuditMonitor can 
LISTEN for updates +NOTIFY_CHANNEL = "galaxy_history_update" + # map between source table and associated incoming id field trigger_config = { "history_dataset_association": "history_id", @@ -11,6 +14,68 @@ } +def use_statement_trigger(version: int) -> bool: + """Return True when the postgres version supports the STATEMENT variant. + + Fresh installs and the pg_notify migration share this predicate to ensure + the trigger function body (STATEMENT references new_table, ROW references NEW) + matches the trigger definition installed at that version. + """ + return version > 10 + + +def build_trigger_fn(function_name: str, id_field: str, *, use_statement: bool, with_notify: bool = True) -> str: + """Build the plpgsql CREATE OR REPLACE FUNCTION body for an audit trigger. + + Shared between runtime install (update_audit_table.install) and alembic + migrations so the two cannot drift. + """ + if use_statement: + notify_block = ( + f""" + FOR _history_id IN SELECT DISTINCT {id_field} FROM new_table WHERE {id_field} IS NOT NULL + LOOP + PERFORM pg_notify('{NOTIFY_CHANNEL}', _history_id::text); + END LOOP; + """ + if with_notify + else "" + ) + declare_block = "DECLARE _history_id integer;" if with_notify else "" + return f""" + CREATE OR REPLACE FUNCTION {function_name}() + RETURNS TRIGGER + LANGUAGE 'plpgsql' + AS $BODY$ + {declare_block} + BEGIN + INSERT INTO history_audit (history_id, update_time) + SELECT DISTINCT {id_field}, clock_timestamp() AT TIME ZONE 'UTC' + FROM new_table + WHERE {id_field} IS NOT NULL + ON CONFLICT DO NOTHING; + {notify_block} + RETURN NULL; + END; + $BODY$ + """ + notify_stmt = f"PERFORM pg_notify('{NOTIFY_CHANNEL}', NEW.{id_field}::text);" if with_notify else "" + return f""" + CREATE OR REPLACE FUNCTION {function_name}() + RETURNS TRIGGER + LANGUAGE 'plpgsql' + AS $BODY$ + BEGIN + INSERT INTO history_audit (history_id, update_time) + VALUES (NEW.{id_field}, clock_timestamp() AT TIME ZONE 'UTC') + ON CONFLICT DO NOTHING; + {notify_stmt} + 
RETURN NULL; + END; + $BODY$ + """ + + def install(engine): """Install history audit table triggers""" sql = _postgres_install(engine) if "postgres" in engine.name else _sqlite_install() @@ -41,54 +106,6 @@ def _postgres_install(engine): sql = [] - # PostgreSQL trigger function template - # need to make separate functions purely because the incoming history_id field name will be - # different for different source tables. There may be a fancier way to dynamically choose - # between incoming fields, but having 2 triggers fns seems straightforward - - def statement_trigger_fn(id_field): - fn = f"{fn_prefix}_{id_field}" - - return f""" - CREATE OR REPLACE FUNCTION {fn}() - RETURNS TRIGGER - LANGUAGE 'plpgsql' - AS $BODY$ - DECLARE - _history_id integer; - BEGIN - INSERT INTO history_audit (history_id, update_time) - SELECT DISTINCT {id_field}, clock_timestamp() AT TIME ZONE 'UTC' - FROM new_table - WHERE {id_field} IS NOT NULL - ON CONFLICT DO NOTHING; - FOR _history_id IN SELECT DISTINCT {id_field} FROM new_table WHERE {id_field} IS NOT NULL - LOOP - PERFORM pg_notify('galaxy_history_update', _history_id::text); - END LOOP; - RETURN NULL; - END; - $BODY$ - """ - - def row_trigger_fn(id_field): - fn = f"{fn_prefix}_{id_field}" - - return f""" - CREATE OR REPLACE FUNCTION {fn}() - RETURNS TRIGGER - LANGUAGE 'plpgsql' - AS $BODY$ - BEGIN - INSERT INTO history_audit (history_id, update_time) - VALUES (NEW.{id_field}, clock_timestamp() AT TIME ZONE 'UTC') - ON CONFLICT DO NOTHING; - PERFORM pg_notify('galaxy_history_update', NEW.{id_field}::text); - RETURN NULL; - END; - $BODY$ - """ - def trigger_def(source_table: str, id_field: str, operation: str, version: int, when: str = "AFTER") -> str: fn = f"{fn_prefix}_{id_field}" # PostgreSQL supports many triggers per operation/table so the label can @@ -100,7 +117,7 @@ def trigger_def(source_table: str, id_field: str, operation: str, version: int, # The use of the keyword PROCEDURE here is historical and deprecated 
(https://www.postgresql.org/docs/11/sql-createtrigger.html). function_keyword = "FUNCTION" if version >= 11 else "PROCEDURE" create_or_replace = "CREATE OR REPLACE" if version >= 14 else "CREATE" - if version >= 10 and when == "AFTER": + if use_statement_trigger(version) and when == "AFTER": return f""" {create_or_replace} TRIGGER {trigger_name} AFTER {operation} @@ -121,10 +138,11 @@ def trigger_def(source_table: str, id_field: str, operation: str, version: int, # pick row or statement triggers depending on postgres version version = engine.dialect.server_version_info[0] - trigger_fn = statement_trigger_fn if version >= 10 else row_trigger_fn + statement = use_statement_trigger(version) for id_field in ["history_id", "id"]: - sql.append(trigger_fn(id_field)) + fn_name = f"{fn_prefix}_{id_field}" + sql.append(build_trigger_fn(fn_name, id_field, use_statement=statement, with_notify=True)) for source_table, id_field in trigger_config.items(): for operation in ["UPDATE", "INSERT"]: From 8d12bff9b922dece40d0a2b303f1b4d3b2efce55 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 15:19:07 +0200 Subject: [PATCH 09/47] Address review feedback for SSE notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend - Narrow SSEEventDispatcher's dependencies. Previous constructor took the whole app and reached for `queue_worker`, `application_stack`, `config.server_name` off it (via send_control_task) — a service- locator pattern where the `MinimalManagerApp` annotation didn't even cover the accessed fields. Inject `queue_worker: Optional[ GalaxyQueueWorker]` and `application_stack: ApplicationStack` directly and publish via `ControlTask(qw).send_task(...)` so the dispatcher is unit-testable without a full app. - Move user-id resolution, notifications-enabled guard, and catch-up wiring out of `api/events.py` and `api/notifications.py`. 
Controllers are now pure `StreamingResponse` wrappers around `EventsService.open_stream(...)` / `NotificationService.open_stream (...)`, matching the three-layer architecture. - Drop the starlette `Request` leak from `managers/sse.py`. `SSEConnectionManager.stream` now takes an `IsDisconnected` (async callable) so the manager stays framework-agnostic; services pass `request.is_disconnected` in. - Centralise SSE event-id generation. New `make_event_id()` / `parse_event_id()` helpers in `managers/sse.py` use `galaxy.model.orm.now` (timezone-naive UTC, matching the rest of Galaxy's DB timestamps) instead of `datetime.utcnow()` scattered across three callers — previous code risked a tz-aware/naïve mix that would silently drop Last-Event-ID catch-up. - Drop dead `app.sse_connection_manager` attribute and `StructuredApp.sse_connection_manager` field. Nothing reads them; the manager is consumed via Lagom (`depends(SSEConnectionManager)` / `app[SSEConnectionManager]`). - Type-annotate the public control-task surface: `send_control_task`, `ControlTask.send_task`, `all_control_queues_for_declare`, SSE control-task handlers (`notify_users` / `notify_broadcast` / `history_update`), and both new stream endpoints. Tighten `SSEEventDispatcher.history_update` to `dict[str, list[int]]`. Tests - Replace the substring `data` matches with `json.loads(data)["content"] ["subject"]` so a regression in the NotificationResponse envelope (renamed keys, missing fields) actually fails. Confirmed shape against `NotificationResponse.model_dump_json()`. - Assert the *absence* of pre-Last-Event-ID subjects in the catch-up test. Without this, a server that replays everything on every reconnect silently passes. - Delete smoke tests that only checked content-type + 200 (subsumed by the functional tests) and the two "existing polling API still works" tests that didn't exercise SSE at all. 
- Add `window.__galaxy_sse_last_event_ts` to the `useSSE` composable and gate the selenium bell/notification tests on it advancing past a baseline. The previous 15s wait would silently become a polling test if the poll interval ever dropped — this makes "update came from SSE" a positive assertion. - Vitest: add a shared `_testing/sseStoreSupport.ts` helper for the two store tests to share a `useSSE` mock that captures the `onEvent` callback. Use it to synthesize real `MessageEvent`s and assert store-state changes for `notification_update`, `notification_status`, and `history_update` — the most consequential handlers were previously uncovered. Also save/restore `document.visibilityState` so the patching doesn't leak across tests, drop the dead `mockSseConnected` ref, tighten the idempotency assertion to advance time by one interval, and collapse the repeated orchestration into a `primeStore(...)` helper. --- client/src/composables/useNotificationSSE.ts | 27 ++- client/src/stores/_testing/sseStoreSupport.ts | 68 ++++++++ client/src/stores/historyStore.test.ts | 141 ++++++++-------- client/src/stores/notificationsStore.test.ts | 156 ++++++++---------- lib/galaxy/app/__init__.py | 14 +- lib/galaxy/managers/sse.py | 39 ++++- lib/galaxy/managers/sse_dispatch.py | 53 +++--- lib/galaxy/queue_worker/__init__.py | 32 ++-- lib/galaxy/queues/__init__.py | 10 +- lib/galaxy/structured_app/__init__.py | 2 - lib/galaxy/webapps/galaxy/api/events.py | 12 +- .../webapps/galaxy/api/notifications.py | 9 +- lib/galaxy/webapps/galaxy/services/events.py | 38 +++++ .../webapps/galaxy/services/notifications.py | 35 +++- test/integration/test_history_sse.py | 37 ----- test/integration/test_notification_sse.py | 83 ++++------ .../test_notification_sse.py | 33 +++- 17 files changed, 470 insertions(+), 319 deletions(-) create mode 100644 client/src/stores/_testing/sseStoreSupport.ts create mode 100644 lib/galaxy/webapps/galaxy/services/events.py diff --git 
a/client/src/composables/useNotificationSSE.ts b/client/src/composables/useNotificationSSE.ts index 088d3d1bf054..b0d34d8e8a5e 100644 --- a/client/src/composables/useNotificationSSE.ts +++ b/client/src/composables/useNotificationSSE.ts @@ -14,6 +14,15 @@ export const SSE_EVENT_TYPES = [ export type SSEEventType = (typeof SSE_EVENT_TYPES)[number]; +interface SSEDebugGlobals { + __galaxy_sse_connected?: boolean; + __galaxy_sse_last_event_ts?: number; +} + +function sseGlobals(): SSEDebugGlobals { + return window as unknown as SSEDebugGlobals; +} + /** * Composable for connecting to the unified SSE event stream. * @@ -27,20 +36,28 @@ export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: reado const connected = ref(false); let eventSource: EventSource | null = null; + // Selenium tests watch __galaxy_sse_last_event_ts to prove that an + // observable state change came from an SSE push and not the polling + // fallback (where __galaxy_sse_last_event_ts would never advance). + const trackedOnEvent = (event: MessageEvent) => { + sseGlobals().__galaxy_sse_last_event_ts = Date.now(); + onEvent(event); + }; + function connect() { disconnect(); const url = withPrefix("/api/events/stream"); eventSource = new EventSource(url); for (const eventType of eventTypes) { - eventSource.addEventListener(eventType, onEvent); + eventSource.addEventListener(eventType, trackedOnEvent); } eventSource.onopen = () => { connected.value = true; // Expose a global readiness flag so Selenium tests can distinguish // a working SSE pipeline from the polling fallback. - (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = true; + sseGlobals().__galaxy_sse_connected = true; }; eventSource.onerror = () => { @@ -49,20 +66,20 @@ export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: reado // so we must not give up on transient errors here — doing so // would leave the client with no updates at all. 
connected.value = false; - (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = false; + sseGlobals().__galaxy_sse_connected = false; }; } function disconnect() { if (eventSource) { for (const eventType of eventTypes) { - eventSource.removeEventListener(eventType, onEvent); + eventSource.removeEventListener(eventType, trackedOnEvent); } eventSource.close(); eventSource = null; } connected.value = false; - (window as unknown as { __galaxy_sse_connected?: boolean }).__galaxy_sse_connected = false; + sseGlobals().__galaxy_sse_connected = false; } onScopeDispose(() => { diff --git a/client/src/stores/_testing/sseStoreSupport.ts b/client/src/stores/_testing/sseStoreSupport.ts new file mode 100644 index 000000000000..4613430bb254 --- /dev/null +++ b/client/src/stores/_testing/sseStoreSupport.ts @@ -0,0 +1,68 @@ +/** + * Shared test helpers for the SSE-driven stores (historyStore, notificationsStore). + * + * Both stores consume the same `useSSE` composable and need: + * - a mock that captures the onEvent callback so tests can synthesize SSE messages; + * - visibility-state patching without leaking across tests (JSDOM's `document` + * is shared by every test in the same worker, so an unrestored + * `Object.defineProperty` causes silent bleed). + * + * Because ``vi.mock`` is hoisted above module-level variables, tests must + * construct the SSE-mock state via ``vi.hoisted`` and then hand it to + * ``sseMockFactory`` from inside the ``vi.mock`` factory. See the ``.test.ts`` + * files in this directory for the pattern. + */ + +import { vi } from "vitest"; +import { type Ref, ref } from "vue"; + +export interface SSEMockState { + onEvent: ((event: MessageEvent) => void) | null; + connect: ReturnType; + disconnect: ReturnType; +} + +/** Build the factory used with ``vi.mock("@/composables/useNotificationSSE", ...)``. 
*/ +export function sseMockFactory(state: SSEMockState) { + return { + useSSE: vi.fn((onEvent: (event: MessageEvent) => void) => { + state.onEvent = onEvent; + return { connect: state.connect, disconnect: state.disconnect }; + }), + }; +} + +/** Synthesize an SSE message through the captured handler. */ +export function emitSse(state: SSEMockState, type: string, payload: unknown): void { + if (!state.onEvent) { + throw new Error("useSSE was not called by the store under test — cannot emit an SSE event"); + } + state.onEvent(new MessageEvent(type, { data: JSON.stringify(payload) })); +} + +/** + * Save the current ``document.visibilityState`` descriptor and return a restorer. + * Call the restorer in ``afterEach`` to prevent patching from leaking into later tests. + */ +export function useVisibilityPatch(): { + set: (state: "visible" | "hidden") => void; + restore: () => void; +} { + const original = Object.getOwnPropertyDescriptor(document, "visibilityState"); + return { + set(state: "visible" | "hidden") { + Object.defineProperty(document, "visibilityState", { + configurable: true, + get: () => state, + }); + document.dispatchEvent(new Event("visibilitychange")); + }, + restore() { + if (original) { + Object.defineProperty(document, "visibilityState", original); + } else { + delete (document as unknown as Record).visibilityState; + } + }, + }; +} diff --git a/client/src/stores/historyStore.test.ts b/client/src/stores/historyStore.test.ts index 2cf0cecbe24a..872233ed7de0 100644 --- a/client/src/stores/historyStore.test.ts +++ b/client/src/stores/historyStore.test.ts @@ -1,26 +1,23 @@ import flushPromises from "flush-promises"; import { createPinia, setActivePinia } from "pinia"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { ref } from "vue"; import { useServerMock } from "@/api/client/__mocks__"; +import { emitSse, sseMockFactory, useVisibilityPatch } from "./_testing/sseStoreSupport"; import { useHistoryStore } from 
"./historyStore"; -// Capture SSE composable usage — neither test should rely on the real -// EventSource. The `connected` ref stays false by default; the store must -// NOT key polling behavior off it. -const mockSseConnect = vi.fn(); -const mockSseDisconnect = vi.fn(); -const mockSseConnected = ref(false); - -vi.mock("@/composables/useNotificationSSE", () => ({ - useSSE: vi.fn(() => ({ - connect: mockSseConnect, - disconnect: mockSseDisconnect, - connected: mockSseConnected, - })), -})); +// ``vi.mock`` is hoisted above module-level ``const`` declarations, so the +// capture-state has to be built via ``vi.hoisted`` to be visible to the factory. +const sseState = vi.hoisted(() => { + return { + onEvent: null as ((event: MessageEvent) => void) | null, + connect: vi.fn(), + disconnect: vi.fn(), + }; +}); + +vi.mock("@/composables/useNotificationSSE", () => sseMockFactory(sseState)); // `watchHistory(app)` is the polling handler invoked on the short/long // interval. We mock it so each invocation is observable without pulling in @@ -40,31 +37,39 @@ vi.mock("@/app", () => ({ const { server, http } = useServerMock(); -function configResponse(enableSse: boolean) { - return { enable_sse_history_updates: enableSse }; -} - function registerDefaultHandlers({ enableSse }: { enableSse: boolean }) { server.use( http.get("/api/configuration", ({ response }) => { // eslint-disable-next-line @typescript-eslint/no-explicit-any - return response(200).json(configResponse(enableSse) as any); + return response(200).json({ enable_sse_history_updates: enableSse } as any); }), ); } +async function primeStore(startFn: () => void): Promise { + startFn(); + // Config load is async; let the watch fire and the initial fetch complete. 
+ await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); +} + describe("historyStore — config-driven SSE vs polling", () => { + let visibility: ReturnType; + beforeEach(() => { setActivePinia(createPinia()); - mockSseConnect.mockClear(); - mockSseDisconnect.mockClear(); - mockSseConnected.value = false; + sseState.connect.mockClear(); + sseState.disconnect.mockClear(); + sseState.onEvent = null; mockWatchHistory.mockClear(); mockRefreshHistoryFromPush.mockClear(); vi.useFakeTimers(); + visibility = useVisibilityPatch(); }); afterEach(() => { + visibility.restore(); vi.useRealTimers(); }); @@ -75,17 +80,9 @@ describe("historyStore — config-driven SSE vs polling", () => { it("primes the store with one initial load, connects SSE, and does not keep polling", async () => { const store = useHistoryStore(); + await primeStore(() => store.startWatchingHistory()); - // `startWatchingHistory` is an exported alias for - // `startWatchingHistoryWithSSE` (see historyStore.ts). - store.startWatchingHistory(); - - // Config loads async; let the watch fire. - await flushPromises(); - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); - - expect(mockSseConnect).toHaveBeenCalledTimes(1); + expect(sseState.connect).toHaveBeenCalledTimes(1); // One-shot initial fetch so the history panel isn't empty before // the first SSE event arrives. expect(mockWatchHistory).toHaveBeenCalledTimes(1); @@ -99,10 +96,7 @@ describe("historyStore — config-driven SSE vs polling", () => { it("does not start polling when the tab regains visibility", async () => { const store = useHistoryStore(); - store.startWatchingHistory(); - await flushPromises(); - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); + await primeStore(() => store.startWatchingHistory()); expect(mockWatchHistory).toHaveBeenCalledTimes(1); // Simulate a tab hide/show cycle. 
`useResourceWatcher` registers @@ -110,22 +104,44 @@ describe("historyStore — config-driven SSE vs polling", () => { // `startWatchingResourceIfNeeded` — in SSE mode that would // silently resume polling. Because we never instantiated the // watcher, no listener should exist and no poll should fire. - Object.defineProperty(document, "visibilityState", { - configurable: true, - get: () => "hidden", - }); - document.dispatchEvent(new Event("visibilitychange")); - Object.defineProperty(document, "visibilityState", { - configurable: true, - get: () => "visible", - }); - document.dispatchEvent(new Event("visibilitychange")); + visibility.set("hidden"); + visibility.set("visible"); await flushPromises(); vi.advanceTimersByTime(30_000); await flushPromises(); expect(mockWatchHistory).toHaveBeenCalledTimes(1); }); + + it("triggers refreshHistoryFromPush when an SSE event names the current history", async () => { + const store = useHistoryStore(); + await primeStore(() => store.startWatchingHistory()); + // Drive the store to a known current-history id so the handler has + // something to match against. ``currentHistoryId`` is a computed + // that only returns the stored id when the history is present in + // ``storedHistories``, so the history has to be registered too. 
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any + store.setHistory({ id: "hist-1" } as any); + store.setCurrentHistoryId("hist-1"); + + mockRefreshHistoryFromPush.mockClear(); + emitSse(sseState, "history_update", { history_ids: ["hist-1", "hist-2"] }); + await flushPromises(); + + expect(mockRefreshHistoryFromPush).toHaveBeenCalledTimes(1); + }); + + it("ignores SSE history events that do not include the current history", async () => { + const store = useHistoryStore(); + await primeStore(() => store.startWatchingHistory()); + store.setCurrentHistoryId("hist-1"); + + mockRefreshHistoryFromPush.mockClear(); + emitSse(sseState, "history_update", { history_ids: ["hist-2"] }); + await flushPromises(); + + expect(mockRefreshHistoryFromPush).not.toHaveBeenCalled(); + }); }); describe("when enable_sse_history_updates is false (polling scenario)", () => { @@ -135,15 +151,9 @@ describe("historyStore — config-driven SSE vs polling", () => { it("does not connect SSE and polls on the configured interval", async () => { const store = useHistoryStore(); + await primeStore(() => store.startWatchingHistory()); - store.startWatchingHistory(); - - // Let the config load and the initial watch fire. - await flushPromises(); - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); - - expect(mockSseConnect).not.toHaveBeenCalled(); + expect(sseState.connect).not.toHaveBeenCalled(); // The resource watcher invokes the handler immediately on start // and then re-schedules after each completion. 
Advance past the @@ -156,24 +166,23 @@ describe("historyStore — config-driven SSE vs polling", () => { expect(mockWatchHistory.mock.calls.length).toBeGreaterThan(initialCalls); }); - it("calling startWatchingHistory again is idempotent (no second SSE, polling already running)", async () => { + it("calling startWatchingHistory again is idempotent (no second SSE, polling tick count +1 only)", async () => { const store = useHistoryStore(); - - store.startWatchingHistory(); - await flushPromises(); - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); + await primeStore(() => store.startWatchingHistory()); const pollsAfterFirst = mockWatchHistory.mock.calls.length; store.startWatchingHistory(); await flushPromises(); - expect(mockSseConnect).not.toHaveBeenCalled(); - // Calling again does not schedule an additional independent - // polling loop — the handler should not have been fired an - // extra time by the second call alone. - expect(mockWatchHistory.mock.calls.length).toBe(pollsAfterFirst); + expect(sseState.connect).not.toHaveBeenCalled(); + // Calling again must not schedule a second independent polling loop. + // Advance past one interval and confirm only one handler tick fires, + // not two. 
+ await vi.advanceTimersByTimeAsync(3000); + await flushPromises(); + const deltaAfterSecond = mockWatchHistory.mock.calls.length - pollsAfterFirst; + expect(deltaAfterSecond).toBeLessThanOrEqual(1); }); }); }); diff --git a/client/src/stores/notificationsStore.test.ts b/client/src/stores/notificationsStore.test.ts index 4b7d8545cdab..f7ff095b18a0 100644 --- a/client/src/stores/notificationsStore.test.ts +++ b/client/src/stores/notificationsStore.test.ts @@ -1,27 +1,24 @@ import flushPromises from "flush-promises"; import { createPinia, setActivePinia } from "pinia"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -import { ref } from "vue"; import { useServerMock } from "@/api/client/__mocks__"; import type { UserNotification } from "@/api/notifications"; +import { emitSse, sseMockFactory, useVisibilityPatch } from "./_testing/sseStoreSupport"; import { useNotificationsStore } from "./notificationsStore"; -// Capture SSE composable usage without opening a real EventSource. -// The returned `connected` ref stays false by default — this is intentional -// because the store must NOT be relying on it; the decision is config-driven. -const mockSseConnect = vi.fn(); -const mockSseDisconnect = vi.fn(); -const mockSseConnected = ref(false); - -vi.mock("@/composables/useNotificationSSE", () => ({ - useSSE: vi.fn(() => ({ - connect: mockSseConnect, - disconnect: mockSseDisconnect, - connected: mockSseConnected, - })), -})); +// ``vi.mock`` is hoisted above module-level ``const`` declarations, so the +// capture-state has to be built via ``vi.hoisted`` to be visible to the factory. +const sseState = vi.hoisted(() => { + return { + onEvent: null as ((event: MessageEvent) => void) | null, + connect: vi.fn(), + disconnect: vi.fn(), + }; +}); + +vi.mock("@/composables/useNotificationSSE", () => sseMockFactory(sseState)); // Realistic fixture: a single unread notification, as returned by // GET /api/notifications. Shape mirrors UserNotification. 
@@ -51,17 +48,13 @@ const SCENARIO_STATUS_SINCE = { const { server, http } = useServerMock(); -function configResponse(overrides: Record) { - // The /api/configuration response carries many fields; the store only - // reads enable_notification_system, so a minimal object suffices. - return { enable_notification_system: false, ...overrides }; -} +const statusSpy = vi.fn(); function registerDefaultHandlers({ enableNotificationSystem }: { enableNotificationSystem: boolean }) { server.use( http.get("/api/configuration", ({ response }) => { // eslint-disable-next-line @typescript-eslint/no-explicit-any - return response(200).json(configResponse({ enable_notification_system: enableNotificationSystem }) as any); + return response(200).json({ enable_notification_system: enableNotificationSystem } as any); }), http.get("/api/notifications", ({ response }) => { return response(200).json([SCENARIO_NOTIFICATION]); @@ -70,22 +63,39 @@ function registerDefaultHandlers({ enableNotificationSystem }: { enableNotificat return response(200).json([]); }), http.get("/api/notifications/status", ({ response }) => { + statusSpy(); // eslint-disable-next-line @typescript-eslint/no-explicit-any return response(200).json(SCENARIO_STATUS_SINCE as any); }), ); } +/** Config load + initial fetch + store-decision watch needs a couple of ticks. */ +async function primeStore(startFn: () => Promise | void): Promise { + // Let the config-store fetch resolve before the store's `watch` runs. + await vi.runOnlyPendingTimersAsync(); + await startFn(); + // Two flush cycles: one for the config watch, one for the resulting fetch. 
+ await flushPromises(); + await vi.runOnlyPendingTimersAsync(); + await flushPromises(); +} + describe("notificationsStore — config-driven SSE vs polling", () => { + let visibility: ReturnType; + beforeEach(() => { setActivePinia(createPinia()); - mockSseConnect.mockClear(); - mockSseDisconnect.mockClear(); - mockSseConnected.value = false; + sseState.connect.mockClear(); + sseState.disconnect.mockClear(); + sseState.onEvent = null; + statusSpy.mockClear(); vi.useFakeTimers(); + visibility = useVisibilityPatch(); }); afterEach(() => { + visibility.restore(); vi.useRealTimers(); }); @@ -96,27 +106,9 @@ describe("notificationsStore — config-driven SSE vs polling", () => { it("connects SSE and does not poll the status endpoint", async () => { const store = useNotificationsStore(); + await primeStore(() => store.startWatchingNotifications()); - // The store fires an initial load (GET /api/notifications + broadcasts) - // and then decides SSE vs polling based on the config flag. - const statusSpy = vi.fn(); - server.use( - http.get("/api/notifications/status", ({ response }) => { - statusSpy(); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - return response(200).json(SCENARIO_STATUS_SINCE as any); - }), - ); - - await vi.runOnlyPendingTimersAsync(); - await store.startWatchingNotifications(); - await flushPromises(); - - // The config load is async — let the watch fire. - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); - - expect(mockSseConnect).toHaveBeenCalledTimes(1); + expect(sseState.connect).toHaveBeenCalledTimes(1); // Advance well past the polling interval (30s) and confirm // the status endpoint is never polled while SSE is the active channel. 
@@ -127,38 +119,46 @@ describe("notificationsStore — config-driven SSE vs polling", () => { it("does not start polling when the tab regains visibility", async () => { const store = useNotificationsStore(); - const statusSpy = vi.fn(); - server.use( - http.get("/api/notifications/status", ({ response }) => { - statusSpy(); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - return response(200).json(SCENARIO_STATUS_SINCE as any); - }), - ); - - await vi.runOnlyPendingTimersAsync(); - await store.startWatchingNotifications(); - await flushPromises(); - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); + await primeStore(() => store.startWatchingNotifications()); - // Tab hide/show cycle — must not trigger the status endpoint. - Object.defineProperty(document, "visibilityState", { - configurable: true, - get: () => "hidden", - }); - document.dispatchEvent(new Event("visibilitychange")); - Object.defineProperty(document, "visibilityState", { - configurable: true, - get: () => "visible", - }); - document.dispatchEvent(new Event("visibilitychange")); + visibility.set("hidden"); + visibility.set("visible"); await flushPromises(); vi.advanceTimersByTime(120_000); await flushPromises(); expect(statusSpy).not.toHaveBeenCalled(); }); + + it("ingests notification_update events into the store state", async () => { + const store = useNotificationsStore(); + await primeStore(() => store.startWatchingNotifications()); + + const pushed = makeNotificationFixture({ + id: "notif-2", + content: { subject: "pushed via sse", message: "hi" }, + }); + emitSse(sseState, "notification_update", pushed); + await flushPromises(); + + expect(store.notifications.map((n) => n.id)).toContain("notif-2"); + expect(store.totalUnreadCount).toBeGreaterThan(0); + }); + + it("ingests notification_status catch-up events on reconnect", async () => { + const store = useNotificationsStore(); + await primeStore(() => store.startWatchingNotifications()); + + emitSse(sseState, 
"notification_status", { + total_unread_count: 42, + notifications: [makeNotificationFixture({ id: "notif-catchup" })], + broadcasts: [], + }); + await flushPromises(); + + expect(store.totalUnreadCount).toBe(42); + expect(store.notifications.map((n) => n.id)).toContain("notif-catchup"); + }); }); describe("when enable_notification_system is false (polling scenario)", () => { @@ -168,23 +168,9 @@ describe("notificationsStore — config-driven SSE vs polling", () => { it("does not connect SSE and polls the status endpoint on the configured interval", async () => { const store = useNotificationsStore(); + await primeStore(() => store.startWatchingNotifications()); - const statusSpy = vi.fn(); - server.use( - http.get("/api/notifications/status", ({ response }) => { - statusSpy(); - // eslint-disable-next-line @typescript-eslint/no-explicit-any - return response(200).json(SCENARIO_STATUS_SINCE as any); - }), - ); - - await vi.runOnlyPendingTimersAsync(); - await store.startWatchingNotifications(); - await flushPromises(); - await vi.runOnlyPendingTimersAsync(); - await flushPromises(); - - expect(mockSseConnect).not.toHaveBeenCalled(); + expect(sseState.connect).not.toHaveBeenCalled(); // Advance past the short polling interval (30s) and confirm // the status endpoint is hit by the resource watcher. diff --git a/lib/galaxy/app/__init__.py b/lib/galaxy/app/__init__.py index 1eac0e496d1b..991944962cf0 100644 --- a/lib/galaxy/app/__init__.py +++ b/lib/galaxy/app/__init__.py @@ -690,7 +690,13 @@ def __init__( # SSE dispatcher must be registered before NotificationManager so Lagom # can auto-inject the Optional[SSEEventDispatcher] constructor arg. 
- self._register_singleton(SSEEventDispatcher, SSEEventDispatcher(self)) + self._register_singleton( + SSEEventDispatcher, + SSEEventDispatcher( + queue_worker=getattr(self, "queue_worker", None), + application_stack=self.application_stack, + ), + ) self.notification_manager = self._register_singleton(NotificationManager) self.interactivetool_manager = InteractiveToolManager(self) @@ -849,8 +855,10 @@ def __init__(self, **kwargs) -> None: # amqp_internal_connection_obj and queue_worker are built in GalaxyManagerApplication # (so Celery workers also get a publisher); here we only register the consumer path, # which is started later via the application_stack postfork hook. - # SSE connection manager for real-time notification push - self.sse_connection_manager = self._register_singleton(SSEConnectionManager) + # SSE connection manager for real-time notification push. + # Consumed via ``depends(SSEConnectionManager)`` / ``app[SSEConnectionManager]``, + # so no module-level attribute is needed — keep the container wiring only. + self._register_singleton(SSEConnectionManager) # AI agent registry and service agent_registry = build_agent_registry(self.config) diff --git a/lib/galaxy/managers/sse.py b/lib/galaxy/managers/sse.py index 3315ff6a6268..78da7ede854f 100644 --- a/lib/galaxy/managers/sse.py +++ b/lib/galaxy/managers/sse.py @@ -9,18 +9,42 @@ import logging from collections import defaultdict from dataclasses import dataclass +from datetime import datetime from typing import ( AsyncIterator, + Awaitable, + Callable, Optional, - TYPE_CHECKING, ) -if TYPE_CHECKING: - from starlette.requests import Request +from galaxy.model.orm.now import now log = logging.getLogger(__name__) +def make_event_id() -> str: + """Return an SSE ``id`` string for Last-Event-ID replay. + + Uses ``galaxy.model.orm.now`` so the timestamp format matches the rest of + Galaxy's database-backed timestamps (timezone-naive UTC). Kept in one place + so producers and the parse path cannot drift. 
+ """ + return now().isoformat() + + +def parse_event_id(event_id: str) -> Optional[datetime]: + """Inverse of :func:`make_event_id`. Returns ``None`` if unparseable.""" + try: + return datetime.fromisoformat(event_id) + except (ValueError, TypeError): + return None + + +#: Async callable returning True when the client has disconnected. The SSE +#: stream loop polls this each iteration so managers don't depend on starlette. +IsDisconnected = Callable[[], Awaitable[bool]] + + @dataclass class SSEEvent: """An event to be sent to an SSE client.""" @@ -142,7 +166,7 @@ def total_connections(self) -> int: async def stream( self, - request: "Request", + is_disconnected: IsDisconnected, user_id: Optional[int], catch_up: Optional[SSEEvent] = None, keepalive: float = 30.0, @@ -151,15 +175,16 @@ async def stream( Handles ``connect``, optional catch-up event priming, the main event loop with a keepalive comment on timeout, disconnect detection, and - ``disconnect`` in ``finally``. Controllers should call this and return - the iterator wrapped in a ``StreamingResponse``. + ``disconnect`` in ``finally``. The ``is_disconnected`` callable is + what the service passes in (typically ``request.is_disconnected`` from + starlette) so the manager stays framework-agnostic. 
""" queue = self.connect(user_id) if catch_up is not None: await queue.put(catch_up) try: while True: - if await request.is_disconnected(): + if await is_disconnected(): break try: event: SSEEvent = await asyncio.wait_for(queue.get(), timeout=keepalive) diff --git a/lib/galaxy/managers/sse_dispatch.py b/lib/galaxy/managers/sse_dispatch.py index 35f6c690d37f..8db0cb6e63aa 100644 --- a/lib/galaxy/managers/sse_dispatch.py +++ b/lib/galaxy/managers/sse_dispatch.py @@ -9,17 +9,18 @@ """ import logging -from datetime import datetime from typing import ( + Any, Optional, - TYPE_CHECKING, ) -from galaxy.queue_worker import send_control_task +from galaxy.managers.sse import make_event_id +from galaxy.queue_worker import ( + ControlTask, + GalaxyQueueWorker, +) from galaxy.queues import all_control_queues_for_declare - -if TYPE_CHECKING: - from galaxy.structured_app import MinimalManagerApp +from galaxy.web_stack import ApplicationStack log = logging.getLogger(__name__) @@ -27,25 +28,35 @@ class SSEEventDispatcher: """Fans out SSE events across all Galaxy worker processes via the control queue. - Thin wrapper around ``send_control_task`` so managers can depend on a narrow, - injectable collaborator instead of reaching into the queue-worker module - directly. Works in both web-worker and Celery-worker contexts — - ``GalaxyManagerApplication`` sets up a publisher-only ``queue_worker`` for - the Celery side. + Dependencies are injected individually so the dispatcher can be unit-tested + without a full ``app``. ``queue_worker`` is ``Optional`` because unit-test + mock apps and Galaxy configurations without AMQP don't construct one — the + dispatcher silently no-ops in that case. 
""" - def __init__(self, app: "MinimalManagerApp") -> None: - self._app = app + def __init__( + self, + queue_worker: Optional[GalaxyQueueWorker], + application_stack: ApplicationStack, + ) -> None: + self._queue_worker = queue_worker + self._application_stack = application_stack - def _send(self, task: str, kwargs: dict) -> None: - if getattr(self._app, "queue_worker", None) is None: + def _send(self, task: str, kwargs: dict[str, Any]) -> None: + if self._queue_worker is None: # AMQP not configured at all (e.g. unit-test mock app). Skip silently. log.debug("SSE dispatch skipped: no queue_worker configured (task=%s)", task) return # Only fan out to webapp processes — job handlers and workflow schedulers # don't have browser SSE connections to push to. - declare_queues = all_control_queues_for_declare(self._app.application_stack, webapp_only=True) - send_control_task(self._app, task, kwargs=kwargs, expiration=10, declare_queues=declare_queues) + declare_queues = all_control_queues_for_declare(self._application_stack, webapp_only=True) + control_task = ControlTask(self._queue_worker) + control_task.send_task( + payload={"task": task, "kwargs": kwargs}, + routing_key="control.*", + expiration=10, + declare_queues=declare_queues, + ) def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str] = None) -> None: self._send( @@ -53,7 +64,7 @@ def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str { "user_ids": user_ids, "payload": payload, - "event_id": event_id or datetime.utcnow().isoformat(), + "event_id": event_id or make_event_id(), }, ) @@ -62,15 +73,15 @@ def notify_broadcast(self, payload: str, event_id: Optional[str] = None) -> None "notify_broadcast", { "payload": payload, - "event_id": event_id or datetime.utcnow().isoformat(), + "event_id": event_id or make_event_id(), }, ) - def history_update(self, user_updates: dict[str, list], event_id: Optional[str] = None) -> None: + def history_update(self, user_updates: 
dict[str, list[int]], event_id: Optional[str] = None) -> None: self._send( "history_update", { "user_updates": user_updates, - "event_id": event_id or datetime.utcnow().isoformat(), + "event_id": event_id or make_event_id(), }, ) diff --git a/lib/galaxy/queue_worker/__init__.py b/lib/galaxy/queue_worker/__init__.py index 6904a3a9e934..04824d8a8498 100644 --- a/lib/galaxy/queue_worker/__init__.py +++ b/lib/galaxy/queue_worker/__init__.py @@ -68,14 +68,14 @@ def send_local_control_task( def send_control_task( - app, - task, - noop_self=False, - get_response=False, - routing_key="control.*", - kwargs=None, + app: "StructuredApp", + task: str, + noop_self: bool = False, + get_response: bool = False, + routing_key: str = "control.*", + kwargs: Optional[dict] = None, expiration: Optional[int] = None, - declare_queues=None, + declare_queues: Optional[list[Queue]] = None, ): """ This sends a control task out to all processes, useful for things like @@ -137,13 +137,13 @@ def on_response(self, message): def send_task( self, - payload, - routing_key, - local=False, - get_response=False, - timeout=10, + payload: dict, + routing_key: str, + local: bool = False, + get_response: bool = False, + timeout: int = 10, expiration: Optional[int] = None, - declare_queues=None, + declare_queues: Optional[list[Queue]] = None, ): if local: declare_queues = self.control_queues @@ -359,7 +359,7 @@ def admin_job_lock(app, **kwargs): log.info(f"Administrative Job Lock is now set to {job_lock}. 
Jobs will {'not' if job_lock else 'now'} dispatch.") -def notify_users(app, **kwargs): +def notify_users(app: "MinimalManagerApp", **kwargs) -> None: """Push SSE events to connected users on this worker process.""" sse_manager = app[SSEConnectionManager] user_ids = kwargs.get("user_ids", []) @@ -370,7 +370,7 @@ def notify_users(app, **kwargs): sse_manager.push_to_user(user_id, event) -def notify_broadcast(app, **kwargs): +def notify_broadcast(app: "MinimalManagerApp", **kwargs) -> None: """Push SSE broadcast events to all connected clients on this worker process.""" sse_manager = app[SSEConnectionManager] payload = kwargs.get("payload", "{}") @@ -379,7 +379,7 @@ def notify_broadcast(app, **kwargs): sse_manager.push_broadcast(event) -def history_update(app, **kwargs): +def history_update(app: "MinimalManagerApp", **kwargs) -> None: """Push SSE history update events to connected users on this worker process. Encodes integer history IDs here (not in the monitor) so the manager layer diff --git a/lib/galaxy/queues/__init__.py b/lib/galaxy/queues/__init__.py index 9643a395595b..a5fab0efc5fe 100644 --- a/lib/galaxy/queues/__init__.py +++ b/lib/galaxy/queues/__init__.py @@ -7,7 +7,10 @@ import datetime import logging import socket -from typing import Optional +from typing import ( + Optional, + TYPE_CHECKING, +) from kombu import ( Connection, @@ -19,6 +22,9 @@ from galaxy.model import WorkerProcess from galaxy.model.orm.now import now +if TYPE_CHECKING: + from galaxy.web_stack import ApplicationStack + log = logging.getLogger(__name__) ALL_CONTROL = "control.*" @@ -29,7 +35,7 @@ WEBAPP_APP_TYPE = "webapp" -def all_control_queues_for_declare(application_stack, webapp_only: bool = False): +def all_control_queues_for_declare(application_stack: "ApplicationStack", webapp_only: bool = False) -> list[Queue]: """ For in-memory routing (used by sqlalchemy-based transports), we need to be able to build the entire routing table in producers. 
diff --git a/lib/galaxy/structured_app/__init__.py b/lib/galaxy/structured_app/__init__.py index 75b53be2ff18..af78cf03d31c 100644 --- a/lib/galaxy/structured_app/__init__.py +++ b/lib/galaxy/structured_app/__init__.py @@ -50,7 +50,6 @@ from galaxy.managers.histories import HistoryManager from galaxy.managers.interactivetool import InteractiveToolManager from galaxy.managers.jobs import JobSearch - from galaxy.managers.sse import SSEConnectionManager from galaxy.managers.tools import DynamicToolManager from galaxy.managers.users import UserManager from galaxy.managers.workflows import ( @@ -177,7 +176,6 @@ class StructuredApp(MinimalManagerApp): vault: Vault webhooks_registry: WebhooksRegistry queue_worker: Any # 'galaxy.queue_worker.GalaxyQueueWorker' - sse_connection_manager: "SSEConnectionManager" data_provider_registry: Any # 'galaxy.visualization.data_providers.registry.DataProviderRegistry' tool_cache: "ToolCache" tool_shed_repository_cache: Optional[ToolShedRepositoryCache] diff --git a/lib/galaxy/webapps/galaxy/api/events.py b/lib/galaxy/webapps/galaxy/api/events.py index ad9a3b2e9d9e..67120591227b 100644 --- a/lib/galaxy/webapps/galaxy/api/events.py +++ b/lib/galaxy/webapps/galaxy/api/events.py @@ -15,8 +15,7 @@ from starlette.responses import StreamingResponse from galaxy.managers.context import ProvidesUserContext -from galaxy.managers.sse import SSEConnectionManager -from galaxy.webapps.galaxy.services.notifications import NotificationService +from galaxy.webapps.galaxy.services.events import EventsService from . 
import ( depends, DependsOnTrans, @@ -30,8 +29,7 @@ @router.cbv class FastAPIEvents: - sse_manager: SSEConnectionManager = depends(SSEConnectionManager) - notifications: NotificationService = depends(NotificationService) + service: EventsService = depends(EventsService) @router.get( "/api/events/stream", @@ -43,7 +41,7 @@ async def stream_events( request: Request, trans: ProvidesUserContext = DependsOnTrans, last_event_id: Optional[str] = Header(None, alias="Last-Event-ID"), - ): + ) -> StreamingResponse: """Opens a Server-Sent Events (SSE) connection that pushes real-time updates for notifications, history changes, and other events. @@ -53,10 +51,8 @@ async def stream_events( Anonymous users receive only broadcast events. """ - user_id = trans.user.id if not trans.anonymous else None - catch_up = self.notifications.build_status_catchup(trans, last_event_id) return StreamingResponse( - self.sse_manager.stream(request, user_id, catch_up=catch_up), + self.service.open_stream(trans, last_event_id, request.is_disconnected), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", diff --git a/lib/galaxy/webapps/galaxy/api/notifications.py b/lib/galaxy/webapps/galaxy/api/notifications.py index ec490dc0dd1d..d14d004fb911 100644 --- a/lib/galaxy/webapps/galaxy/api/notifications.py +++ b/lib/galaxy/webapps/galaxy/api/notifications.py @@ -19,7 +19,6 @@ from starlette.responses import StreamingResponse from galaxy.managers.context import ProvidesUserContext -from galaxy.managers.sse import SSEConnectionManager from galaxy.schema.notifications import ( BroadcastNotificationCreateRequest, BroadcastNotificationListResponse, @@ -55,7 +54,6 @@ @router.cbv class FastAPINotifications: service: NotificationService = depends(NotificationService) - sse_manager: SSEConnectionManager = depends(SSEConnectionManager) @router.get( "/api/notifications/stream", @@ -67,7 +65,7 @@ async def stream_notifications( request: Request, trans: ProvidesUserContext = DependsOnTrans, 
last_event_id: Optional[str] = Header(None, alias="Last-Event-ID"), - ): + ) -> StreamingResponse: """Opens a Server-Sent Events (SSE) connection that pushes notification updates in real-time. On reconnect, the browser sends the ``Last-Event-ID`` header automatically. @@ -76,11 +74,8 @@ async def stream_notifications( Anonymous users receive only broadcast events. """ - self.service.notification_manager.ensure_notifications_enabled() - user_id = trans.user.id if not trans.anonymous else None - catch_up = self.service.build_status_catchup(trans, last_event_id) return StreamingResponse( - self.sse_manager.stream(request, user_id, catch_up=catch_up), + self.service.open_stream(trans, last_event_id, request.is_disconnected), media_type="text/event-stream", headers={ "Cache-Control": "no-cache", diff --git a/lib/galaxy/webapps/galaxy/services/events.py b/lib/galaxy/webapps/galaxy/services/events.py new file mode 100644 index 000000000000..cddc9b7478ee --- /dev/null +++ b/lib/galaxy/webapps/galaxy/services/events.py @@ -0,0 +1,38 @@ +"""Service layer for the unified SSE events endpoint. + +Unlike :class:`NotificationService.open_stream`, this service does **not** +require the notification system to be enabled — ``/api/events/stream`` also +serves history updates and other event types independent of the notification +configuration. When notifications are disabled the catch-up event is simply +skipped; the stream still delivers other push events. 
+""" + +from typing import ( + AsyncIterator, + Optional, +) + +from galaxy.managers.context import ProvidesUserContext +from galaxy.managers.sse import ( + IsDisconnected, + SSEConnectionManager, +) +from galaxy.webapps.galaxy.services.base import ServiceBase +from galaxy.webapps.galaxy.services.notifications import NotificationService + + +class EventsService(ServiceBase): + def __init__(self, sse_manager: SSEConnectionManager, notifications: NotificationService): + self.sse_manager = sse_manager + self.notifications = notifications + + def open_stream( + self, + user_context: ProvidesUserContext, + last_event_id: Optional[str], + is_disconnected: IsDisconnected, + ) -> AsyncIterator[str]: + """Open an SSE events stream; anonymous users receive only broadcasts.""" + user_id = user_context.user.id if not user_context.anonymous else None + catch_up = self.notifications.build_status_catchup(user_context, last_event_id) + return self.sse_manager.stream(is_disconnected, user_id, catch_up=catch_up) diff --git a/lib/galaxy/webapps/galaxy/services/notifications.py b/lib/galaxy/webapps/galaxy/services/notifications.py index c11e0af8737f..74df0ffa6914 100644 --- a/lib/galaxy/webapps/galaxy/services/notifications.py +++ b/lib/galaxy/webapps/galaxy/services/notifications.py @@ -1,5 +1,6 @@ from datetime import datetime from typing import ( + AsyncIterator, NoReturn, Optional, Union, @@ -14,7 +15,13 @@ ) from galaxy.managers.context import ProvidesUserContext from galaxy.managers.notification import NotificationManager -from galaxy.managers.sse import SSEEvent +from galaxy.managers.sse import ( + IsDisconnected, + make_event_id, + parse_event_id, + SSEConnectionManager, + SSEEvent, +) from galaxy.model import User from galaxy.schema.fields import Security from galaxy.schema.notifications import ( @@ -43,8 +50,25 @@ class NotificationService(ServiceBase): - def __init__(self, notification_manager: NotificationManager): + def __init__(self, notification_manager: 
NotificationManager, sse_manager: SSEConnectionManager): self.notification_manager = notification_manager + self.sse_manager = sse_manager + + def open_stream( + self, + user_context: ProvidesUserContext, + last_event_id: Optional[str], + is_disconnected: IsDisconnected, + ) -> AsyncIterator[str]: + """Open an SSE notification stream for ``user_context``. + + Enforces the notifications-enabled guard, builds the optional catch-up, + and resolves the user id so the controller stays a thin wrapper. + """ + self.notification_manager.ensure_notifications_enabled() + user_id = user_context.user.id if not user_context.anonymous else None + catch_up = self.build_status_catchup(user_context, last_event_id) + return self.sse_manager.stream(is_disconnected, user_id, catch_up=catch_up) def send_notification( self, sender_context: ProvidesUserContext, payload: NotificationCreateRequestBody @@ -111,15 +135,14 @@ def build_status_catchup( """ if not last_event_id or not self.notification_manager.notifications_enabled: return None - try: - since = datetime.fromisoformat(last_event_id) - except (ValueError, TypeError): + since = parse_event_id(last_event_id) + if since is None: return None catchup = self.get_notifications_status(user_context, since) return SSEEvent( event="notification_status", data=catchup.model_dump_json(), - id=datetime.utcnow().isoformat(), + id=make_event_id(), ) def get_notifications_status(self, user_context: ProvidesUserContext, since: datetime) -> NotificationStatusSummary: diff --git a/test/integration/test_history_sse.py b/test/integration/test_history_sse.py index 6dd15c7afea1..4c6d3cc8e12c 100644 --- a/test/integration/test_history_sse.py +++ b/test/integration/test_history_sse.py @@ -34,31 +34,6 @@ def _create_history(self, name=None) -> str: self._assert_status_code_is_ok(response) return response.json()["id"] - def test_sse_events_endpoint_returns_event_stream(self): - """The /api/events/stream endpoint should return content-type 
text/event-stream.""" - response = requests.get( - self._events_stream_url(), - params={"key": self.galaxy_interactor.api_key}, - stream=True, - timeout=5, - ) - assert response.status_code == 200 - assert "text/event-stream" in response.headers.get("content-type", "") - response.close() - - def test_sse_receives_history_update_on_dataset_upload(self): - """When a dataset is uploaded, a history_update SSE event should be received.""" - history_id = self._create_history() - - listener = SSELineListener(self._events_stream_url(), self.galaxy_interactor.api_key) - listener.start() - try: - self.dataset_populator.new_dataset(history_id, wait=False) - history_events = listener.wait_for_event("history_update") - assert len(history_events) > 0 - finally: - listener.stop() - def test_history_update_contains_current_history_id(self): """The history_update event should contain the history's encoded ID.""" history_id = self._create_history() @@ -117,15 +92,3 @@ def test_history_update_is_scoped_to_owning_user(self): assert ( user_b_history_id not in seen_ids ), f"User A received history_update for user B's history ({user_b_history_id}): {history_events}" - - def test_existing_polling_api_still_works(self): - """The existing current_history_json endpoint should continue to work.""" - url = urljoin(self.url, "history/current_history_json") - response = requests.get( - url, - params={"key": self.galaxy_interactor.api_key}, - ) - assert response.status_code == 200 - data = response.json() - assert "id" in data - assert "update_time" in data diff --git a/test/integration/test_notification_sse.py b/test/integration/test_notification_sse.py index 5c8622514a42..1f7340618988 100644 --- a/test/integration/test_notification_sse.py +++ b/test/integration/test_notification_sse.py @@ -1,12 +1,10 @@ """Integration tests for the notification SSE (Server-Sent Events) endpoint.""" -from datetime import datetime +import json from typing import Optional from urllib.parse import urljoin from uuid 
import uuid4 -import requests - from galaxy_test.base.populators import DatasetPopulator from galaxy_test.base.sse import SSELineListener from galaxy_test.driver.integration_util import IntegrationTestCase @@ -38,6 +36,21 @@ def notification_broadcast_test_data(subject: Optional[str] = None, message: Opt } +def _notification_subjects(events: list[dict]) -> list[str]: + """Extract ``content.subject`` from each SSE ``data`` payload. + + Verifies JSON shape rather than substring-matching raw ``data`` strings — a + regression in the envelope (missing id, wrong serializer, content key + renamed) fails here instead of silently passing. Each ``data`` payload is + a ``NotificationResponse`` dump with a top-level ``content.subject``. + """ + subjects = [] + for event in events: + payload = json.loads(event["data"]) + subjects.append(payload["content"]["subject"]) + return subjects + + class TestNotificationSSEIntegration(IntegrationTestCase): dataset_populator: DatasetPopulator framework_tool_and_types = False @@ -55,18 +68,6 @@ def setUp(self): def _stream_url(self) -> str: return urljoin(self.url, "api/notifications/stream") - def test_sse_endpoint_returns_event_stream(self): - """The SSE endpoint should return content-type text/event-stream.""" - response = requests.get( - self._stream_url(), - params={"key": self.galaxy_interactor.api_key}, - stream=True, - timeout=5, - ) - assert response.status_code == 200 - assert "text/event-stream" in response.headers.get("content-type", "") - response.close() - def test_sse_receives_notification_events(self): """When a notification is created, the SSE stream should receive it.""" user = self._setup_user(f"{uuid4()}@galaxy.test") @@ -87,8 +88,8 @@ def test_sse_receives_notification_events(self): finally: listener.stop() - assert any( - subject in e.get("data", "") for e in notification_events + assert subject in _notification_subjects( + notification_events ), f"Expected subject '{subject}' in SSE events, got: 
{notification_events}" def test_sse_receives_broadcast_events(self): @@ -105,16 +106,17 @@ def test_sse_receives_broadcast_events(self): finally: listener.stop() - assert any( - subject in e.get("data", "") for e in broadcast_events - ), f"Expected subject '{subject}' in broadcast SSE events, got: {broadcast_events}" + # Broadcast events carry a BroadcastNotificationResponse, which shares + # the top-level content.subject shape with per-user notifications. + broadcast_subjects = [json.loads(e["data"])["content"]["subject"] for e in broadcast_events] + assert subject in broadcast_subjects, f"Expected subject '{subject}' in broadcast events: {broadcast_events}" def test_sse_catchup_on_reconnect(self): """Reconnecting with Last-Event-ID should replay a catch-up notification_status event. The ``Last-Event-ID`` value is the server-issued ID from a prior event, - not a client-side ``datetime.utcnow()``. This avoids clock-skew flake - between the test runner and the app in containerized CI. + not a client-side timestamp. This avoids clock-skew flake between the + test runner and the app in containerized CI. """ user = self._setup_user(f"{uuid4()}@galaxy.test") _, user_api_key = self._setup_user_get_key(user["email"]) @@ -146,8 +148,8 @@ def test_sse_catchup_on_reconnect(self): response = self._post("notifications", data=request, admin=True, json=True) self._assert_status_code_is_ok(response) - # Reconnect with Last-Event-ID = the captured id. The server catch-up runs before - # the `ready` event and must include the missed notification. + # Reconnect with Last-Event-ID = the captured id. The catch-up must include + # the notification sent after that id but not the one that produced it. 
listener_2 = SSELineListener( self._stream_url(), user_api_key, @@ -159,28 +161,11 @@ def test_sse_catchup_on_reconnect(self): finally: listener_2.stop() - assert any( - subject_2 in e.get("data", "") for e in status_events - ), f"Expected subject '{subject_2}' in catch-up event, got: {status_events}" - - def test_existing_polling_api_still_works(self): - """The existing polling endpoint should continue to work alongside SSE.""" - user = self._setup_user(f"{uuid4()}@galaxy.test") - - before = datetime.utcnow() - - subject = f"polling_test_{uuid4()}" - request = { - "recipients": {"user_ids": [user["id"]]}, - "notification": notification_test_data(subject=subject), - } - response = self._post("notifications", data=request, admin=True, json=True) - self._assert_status_code_is_ok(response) - - with self._different_user(user["email"]): - status_response = self._get(f"notifications/status?since={before.isoformat()}") - self._assert_status_code_is_ok(status_response) - status = status_response.json() - assert status["total_unread_count"] == 1 - assert len(status["notifications"]) == 1 - assert status["notifications"][0]["content"]["subject"] == subject + replayed_subjects: list[str] = [] + for event in status_events: + payload = json.loads(event["data"]) + replayed_subjects.extend(n["content"]["subject"] for n in payload.get("notifications", [])) + assert subject_2 in replayed_subjects, f"Missed catch-up of '{subject_2}': {status_events}" + assert ( + subject_1 not in replayed_subjects + ), f"Last-Event-ID did not filter — '{subject_1}' replayed: {status_events}" diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index e3e9a564c844..bb39e655bc03 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -15,6 +15,7 @@ from .framework import SeleniumIntegrationTestCase SSE_CONNECT_TIMEOUT_SECONDS = 15 +SSE_EVENT_TIMEOUT_SECONDS = 15 class 
TestNotificationSSESeleniumIntegration(SeleniumIntegrationTestCase): @@ -39,6 +40,23 @@ def _wait_for_sse_connected(self) -> None: timeout=SSE_CONNECT_TIMEOUT_SECONDS, ) + def _last_sse_event_ts(self) -> int: + """Return the last SSE event timestamp recorded by the composable, or 0.""" + return self.driver.execute_script("return window.__galaxy_sse_last_event_ts || 0") or 0 + + def _wait_for_sse_event_after(self, baseline_ts: int) -> None: + """Block until an SSE event arrives after ``baseline_ts``. + + Guards against a silent regression where the UI update originates from + the polling fallback rather than the SSE push: ``__galaxy_sse_last_event_ts`` + only advances when the composable's event listener fires. + """ + wait_on( + lambda: True if self._last_sse_event_ts() > baseline_ts else None, + "window.__galaxy_sse_last_event_ts advanced past baseline", + timeout=SSE_EVENT_TIMEOUT_SECONDS, + ) + @selenium_test @managed_history def test_notification_appears_via_sse(self): @@ -50,6 +68,7 @@ def test_notification_appears_via_sse(self): # Navigate to notifications page so the store is watching self.driver.get(f"{self.target_url_from_selenium}/user/notifications") self._wait_for_sse_connected() + baseline_ts = self._last_sse_event_ts() self.screenshot("notification_sse_before") # Send a notification to this user via the admin API @@ -70,10 +89,12 @@ def test_notification_appears_via_sse(self): response = self._post("notifications", data=notification_request, admin=True, json=True) self._assert_status_code_is_ok(response) - # Wait for the notification to appear in the UI — SSE should push it - # within a few seconds, without needing a page refresh. - # We wait up to 15 seconds checking for the subject text to appear. - self.driver.wait_for_selector_visible(f"text={subject}", timeout=15000) + # Prove the incoming update arrived via SSE: the event-timestamp hook + # only advances when useSSE's listener fires. 
If this times out while + # the UI still shows the notification, polling picked it up — a silent + # regression this assertion catches. + self._wait_for_sse_event_after(baseline_ts) + self.driver.wait_for_selector_visible(f"text={subject}", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) self.screenshot("notification_sse_after") @selenium_test @@ -86,6 +107,7 @@ def test_notification_bell_updates_via_sse(self): # Go to home page (bell is in masthead) self.home() self._wait_for_sse_connected() + baseline_ts = self._last_sse_event_ts() # Send a notification subject = f"Bell Test {uuid4()}" @@ -105,6 +127,7 @@ def test_notification_bell_updates_via_sse(self): response = self._post("notifications", data=notification_request, admin=True, json=True) self._assert_status_code_is_ok(response) + self._wait_for_sse_event_after(baseline_ts) # The indicator dot should appear on the bell (within the #activity-notifications element) - self.driver.wait_for_selector_visible("#activity-notifications .indicator", timeout=15000) + self.driver.wait_for_selector_visible("#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) self.screenshot("notification_bell_indicator") From 2515edacfccc72ce67ed5948f4d4680dec8bf992 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 15:30:21 +0200 Subject: [PATCH 10/47] Ruff --- lib/galaxy/managers/history_audit_monitor.py | 4 ++-- lib/galaxy/managers/sse.py | 8 +++++--- lib/galaxy/webapps/galaxy/services/events.py | 2 +- lib/galaxy/webapps/galaxy/services/notifications.py | 2 +- lib/galaxy_test/base/sse.py | 4 ++-- test/integration_selenium/test_notification_sse.py | 4 +++- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/lib/galaxy/managers/history_audit_monitor.py b/lib/galaxy/managers/history_audit_monitor.py index e8f3fadd08e3..ec76cf94a2fe 100644 --- a/lib/galaxy/managers/history_audit_monitor.py +++ b/lib/galaxy/managers/history_audit_monitor.py @@ -15,13 +15,13 @@ defaultdict, OrderedDict, ) +from 
collections.abc import Iterator from datetime import ( datetime, timedelta, ) from typing import ( Any, - Iterator, Optional, ) @@ -120,7 +120,7 @@ def __init__( self._thread: Optional[threading.Thread] = None self._active = False # Bounded LRU cache: history_id -> user_id, refreshed on miss. - self._history_owner_cache: "OrderedDict[int, int]" = OrderedDict() + self._history_owner_cache: OrderedDict[int, int] = OrderedDict() def start(self) -> None: if self._active: diff --git a/lib/galaxy/managers/sse.py b/lib/galaxy/managers/sse.py index 78da7ede854f..2c83a549e081 100644 --- a/lib/galaxy/managers/sse.py +++ b/lib/galaxy/managers/sse.py @@ -8,12 +8,14 @@ import asyncio import logging from collections import defaultdict -from dataclasses import dataclass -from datetime import datetime -from typing import ( +from collections.abc import ( AsyncIterator, Awaitable, Callable, +) +from dataclasses import dataclass +from datetime import datetime +from typing import ( Optional, ) diff --git a/lib/galaxy/webapps/galaxy/services/events.py b/lib/galaxy/webapps/galaxy/services/events.py index cddc9b7478ee..aea3998ac25e 100644 --- a/lib/galaxy/webapps/galaxy/services/events.py +++ b/lib/galaxy/webapps/galaxy/services/events.py @@ -7,8 +7,8 @@ skipped; the stream still delivers other push events. 
""" +from collections.abc import AsyncIterator from typing import ( - AsyncIterator, Optional, ) diff --git a/lib/galaxy/webapps/galaxy/services/notifications.py b/lib/galaxy/webapps/galaxy/services/notifications.py index 74df0ffa6914..b040e2b908e4 100644 --- a/lib/galaxy/webapps/galaxy/services/notifications.py +++ b/lib/galaxy/webapps/galaxy/services/notifications.py @@ -1,6 +1,6 @@ +from collections.abc import AsyncIterator from datetime import datetime from typing import ( - AsyncIterator, NoReturn, Optional, Union, diff --git a/lib/galaxy_test/base/sse.py b/lib/galaxy_test/base/sse.py index 47ee4172be4c..be7b3052bf17 100644 --- a/lib/galaxy_test/base/sse.py +++ b/lib/galaxy_test/base/sse.py @@ -71,7 +71,7 @@ def __init__( self._collected: list[str] = [] self._stop = threading.Event() self._ready = threading.Event() - self._errors: "queue.Queue[BaseException]" = queue.Queue() + self._errors: queue.Queue[BaseException] = queue.Queue() self._thread = threading.Thread(target=self._listen, daemon=True) def start(self) -> None: @@ -130,7 +130,7 @@ def _listen(self) -> None: if self._stop.is_set(): break resp.close() - except BaseException as exc: + except Exception as exc: self._errors.put(exc) # Ensure start() doesn't hang forever on connection failure. 
self._ready.set() diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index bb39e655bc03..70ad646c0caa 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -129,5 +129,7 @@ def test_notification_bell_updates_via_sse(self): self._wait_for_sse_event_after(baseline_ts) # The indicator dot should appear on the bell (within the #activity-notifications element) - self.driver.wait_for_selector_visible("#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) + self.driver.wait_for_selector_visible( + "#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000 + ) self.screenshot("notification_bell_indicator") From 079d4ce52d66bcfca8aedb172f325fff3ac83a17 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 17 Apr 2026 16:13:24 +0200 Subject: [PATCH 11/47] Add category discriminator to notification content fixtures The generated MessageNotificationContent schema now includes the category: "message" discriminator, so content literals in Partial overrides no longer narrow without it. 
--- client/src/stores/notificationsStore.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client/src/stores/notificationsStore.test.ts b/client/src/stores/notificationsStore.test.ts index f7ff095b18a0..ef568348f7b7 100644 --- a/client/src/stores/notificationsStore.test.ts +++ b/client/src/stores/notificationsStore.test.ts @@ -34,7 +34,7 @@ function makeNotificationFixture(overrides: Partial = {}): Use expiration_time: null, seen_time: null, deleted: false, - content: { subject: "hello", message: "welcome" }, + content: { category: "message", subject: "hello", message: "welcome" }, ...overrides, } as UserNotification; } @@ -136,7 +136,7 @@ describe("notificationsStore — config-driven SSE vs polling", () => { const pushed = makeNotificationFixture({ id: "notif-2", - content: { subject: "pushed via sse", message: "hi" }, + content: { category: "message", subject: "pushed via sse", message: "hi" }, }); emitSse(sseState, "notification_update", pushed); await flushPromises(); From 5ae948c1df177593ff571c5cb838b86ccc15fcad Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Sun, 19 Apr 2026 19:11:36 +0200 Subject: [PATCH 12/47] Fix mypy errors on sse-notifications branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move send_notification_internal from NotificationService to NotificationManager so non-web callers (psa_authnz, sharable service) can dispatch notifications without constructing a web-layer service — which is why the sse_manager constructor drift was caught by mypy in the first place. Relocate async_task_summary to galaxy.celery.helpers so the manager can call it without reaching into galaxy.webapps.galaxy.services.base; existing service callers keep working via an explicit re-export. Also narrow Optional api_key before SSELineListener in the SSE integration tests, and call wait_for_selector_visible on self rather than self.driver in the selenium SSE test. 
--- lib/galaxy/authnz/psa_authnz.py | 4 +-- lib/galaxy/celery/helpers.py | 25 ++++++++++++++++ lib/galaxy/managers/notification.py | 28 ++++++++++++++++++ lib/galaxy/webapps/galaxy/services/base.py | 26 +---------------- .../webapps/galaxy/services/notifications.py | 29 ++----------------- .../webapps/galaxy/services/sharable.py | 2 +- test/integration/test_history_sse.py | 8 +++-- test/integration/test_notification_sse.py | 4 ++- .../test_notification_sse.py | 4 +-- test/unit/authnz/test_psa_authnz.py | 25 ++++++++-------- 10 files changed, 81 insertions(+), 74 deletions(-) create mode 100644 lib/galaxy/celery/helpers.py diff --git a/lib/galaxy/authnz/psa_authnz.py b/lib/galaxy/authnz/psa_authnz.py index 5d1189b38370..75247adc4ce1 100644 --- a/lib/galaxy/authnz/psa_authnz.py +++ b/lib/galaxy/authnz/psa_authnz.py @@ -798,8 +798,6 @@ def _send_oidc_profile_update_notification(trans, user, updates: list[str]) -> N NotificationVariant, PersonalNotificationCategory, ) - from galaxy.webapps.galaxy.services.notifications import NotificationService - labels: dict[str, str] = { "email": "email address", "username": "public name", @@ -819,7 +817,7 @@ def _send_oidc_profile_update_notification(trans, user, updates: list[str]) -> N ), galaxy_url=None, ) - NotificationService(trans.app.notification_manager).send_notification_internal(request, force_sync=True) + trans.app.notification_manager.send_notification_internal(request, force_sync=True) except Exception as exc: log.warning("OIDC profile update notification failed for user %s: %s", user.id, exc) diff --git a/lib/galaxy/celery/helpers.py b/lib/galaxy/celery/helpers.py new file mode 100644 index 000000000000..e771e340cb1d --- /dev/null +++ b/lib/galaxy/celery/helpers.py @@ -0,0 +1,25 @@ +from celery.result import AsyncResult + +from galaxy.schema.schema import AsyncTaskResultSummary + + +def async_task_summary(async_result: AsyncResult) -> AsyncTaskResultSummary: + name = None + try: + name = async_result.name + except 
AttributeError: + # if backend is disabled, we won't have this + pass + queue = None + try: + queue = async_result.queue + except AttributeError: + # if backend is disabled, we won't have this + pass + + return AsyncTaskResultSummary( + id=str(async_result.id), + ignored=async_result.ignored, + name=name, + queue=queue, + ) diff --git a/lib/galaxy/managers/notification.py b/lib/galaxy/managers/notification.py index 081fad7b7aa9..676660342531 100644 --- a/lib/galaxy/managers/notification.py +++ b/lib/galaxy/managers/notification.py @@ -5,6 +5,7 @@ cast, NamedTuple, Optional, + Union, ) from urllib.parse import urlparse @@ -28,6 +29,7 @@ from typing_extensions import Protocol from galaxy import util +from galaxy.celery.helpers import async_task_summary from galaxy.config import ( GalaxyAppConfiguration, templates, @@ -57,6 +59,7 @@ NotificationBroadcastUpdateRequest, NotificationCategorySettings, NotificationChannelSettings, + NotificationCreatedResponse, NotificationCreateData, NotificationCreateRequest, NotificationRecipients, @@ -67,6 +70,7 @@ UserNotificationPreferences, UserNotificationUpdateRequest, ) +from galaxy.schema.schema import AsyncTaskResultSummary log = logging.getLogger(__name__) @@ -179,6 +183,30 @@ def send_notification_to_recipients(self, request: NotificationCreateRequest) -> return notification, notifications_sent + def send_notification_internal( + self, request: NotificationCreateRequest, force_sync: bool = False + ) -> Union[NotificationCreatedResponse, AsyncTaskResultSummary]: + """Sends a notification to a list of recipients (users, groups or roles). + + If `force_sync` is set to `True`, the notification recipients will be processed synchronously instead of + in a background task. + + Note: This function is meant for internal use from other callers that don't need to check sender permissions. 
+ """ + if self.can_send_notifications_async and not force_sync: + # Local import: galaxy.celery.tasks imports NotificationManager at module load, + # so importing it at module level here would be a circular dependency. + from galaxy.celery.tasks import send_notification_to_recipients_async + + result = send_notification_to_recipients_async.delay(request) + return async_task_summary(result) + + notification, recipient_user_count = self.send_notification_to_recipients(request) + return NotificationCreatedResponse( + total_notifications_sent=recipient_user_count, + notification=NotificationResponse.model_validate(notification), + ) + def _create_associations(self, notification: Notification, users: list[User]) -> int: success_count = 0 for user in users: diff --git a/lib/galaxy/webapps/galaxy/services/base.py b/lib/galaxy/webapps/galaxy/services/base.py index 5e096c7a6191..d485d277d5ba 100644 --- a/lib/galaxy/webapps/galaxy/services/base.py +++ b/lib/galaxy/webapps/galaxy/services/base.py @@ -7,8 +7,7 @@ Optional, ) -from celery.result import AsyncResult - +from galaxy.celery.helpers import async_task_summary as async_task_summary # re-export for existing callers from galaxy.exceptions import ( AuthenticationRequired, ConfigDoesNotAllowException, @@ -32,7 +31,6 @@ ) from galaxy.schema.fields import EncodedDatabaseIdField from galaxy.schema.schema import ( - AsyncTaskResultSummary, ToolRequestDetailedModel, ToolRequestModel, ) @@ -185,28 +183,6 @@ def create_objects_from_store( ) -def async_task_summary(async_result: AsyncResult) -> AsyncTaskResultSummary: - name = None - try: - name = async_result.name - except AttributeError: - # if backend is disabled, we won't have this - pass - queue = None - try: - queue = async_result.queue - except AttributeError: - # if backend is disabled, we won't have this - pass - - return AsyncTaskResultSummary( - id=str(async_result.id), - ignored=async_result.ignored, - name=name, - queue=queue, - ) - - def 
_encode_tool_request(tool_request: ToolRequest, security: IdEncodingHelper) -> dict[str, Any]: """Encode request IDs using strongly-typed parameter walking.""" tool_source_model = tool_request.tool_source diff --git a/lib/galaxy/webapps/galaxy/services/notifications.py b/lib/galaxy/webapps/galaxy/services/notifications.py index b040e2b908e4..046f5d1718ad 100644 --- a/lib/galaxy/webapps/galaxy/services/notifications.py +++ b/lib/galaxy/webapps/galaxy/services/notifications.py @@ -6,7 +6,6 @@ Union, ) -from galaxy.celery.tasks import send_notification_to_recipients_async from galaxy.exceptions import ( AdminRequiredException, AuthenticationRequired, @@ -43,10 +42,7 @@ UserNotificationUpdateRequest, ) from galaxy.schema.schema import AsyncTaskResultSummary -from galaxy.webapps.galaxy.services.base import ( - async_task_summary, - ServiceBase, -) +from galaxy.webapps.galaxy.services.base import ServiceBase class NotificationService(ServiceBase): @@ -87,28 +83,7 @@ def send_notification( recipients=payload.recipients, galaxy_url=galaxy_url, ) - return self.send_notification_internal(request) - - def send_notification_internal( - self, request: NotificationCreateRequest, force_sync: bool = False - ) -> Union[NotificationCreatedResponse, AsyncTaskResultSummary]: - """Sends a notification to a list of recipients (users, groups or roles). - - If `force_sync` is set to `True`, the notification recipients will be processed synchronously instead of - in a background task. - - Note: This function is meant for internal use from other services that don't need to check sender permissions. 
- """ - if self.notification_manager.can_send_notifications_async and not force_sync: - result = send_notification_to_recipients_async.delay(request) - summary = async_task_summary(result) - return summary - - notification, recipient_user_count = self.notification_manager.send_notification_to_recipients(request) - return NotificationCreatedResponse( - total_notifications_sent=recipient_user_count, - notification=NotificationResponse.model_validate(notification), - ) + return self.notification_manager.send_notification_internal(request) def broadcast( self, sender_context: ProvidesUserContext, payload: BroadcastNotificationCreateRequest diff --git a/lib/galaxy/webapps/galaxy/services/sharable.py b/lib/galaxy/webapps/galaxy/services/sharable.py index 33697adf51f2..2fc7174ae061 100644 --- a/lib/galaxy/webapps/galaxy/services/sharable.py +++ b/lib/galaxy/webapps/galaxy/services/sharable.py @@ -187,7 +187,7 @@ def _send_notification_to_users( ) # We can set force_sync=True here because we already have the set of users to notify # and there is no need to resolve them asynchronously as no groups or roles are involved. 
- self.notification_service.send_notification_internal(request, force_sync=True) + self.notification_service.notification_manager.send_notification_internal(request, force_sync=True) class SharedItemNotificationFactory: diff --git a/test/integration/test_history_sse.py b/test/integration/test_history_sse.py index 4c6d3cc8e12c..18b17226e5a8 100644 --- a/test/integration/test_history_sse.py +++ b/test/integration/test_history_sse.py @@ -38,7 +38,9 @@ def test_history_update_contains_current_history_id(self): """The history_update event should contain the history's encoded ID.""" history_id = self._create_history() - listener = SSELineListener(self._events_stream_url(), self.galaxy_interactor.api_key) + api_key = self.galaxy_interactor.api_key + assert api_key is not None + listener = SSELineListener(self._events_stream_url(), api_key) listener.start() try: self.dataset_populator.new_dataset(history_id, wait=False) @@ -61,7 +63,9 @@ def test_history_update_is_scoped_to_owning_user(self): user_a_history_id = self._create_history() - listener = SSELineListener(self._events_stream_url(), self.galaxy_interactor.api_key) + api_key = self.galaxy_interactor.api_key + assert api_key is not None + listener = SSELineListener(self._events_stream_url(), api_key) listener.start() try: # User B creates a history and uploads to it. User A must NOT see this. 
diff --git a/test/integration/test_notification_sse.py b/test/integration/test_notification_sse.py index 1f7340618988..ecfc5c34f2a0 100644 --- a/test/integration/test_notification_sse.py +++ b/test/integration/test_notification_sse.py @@ -94,7 +94,9 @@ def test_sse_receives_notification_events(self): def test_sse_receives_broadcast_events(self): """When a broadcast is created, the SSE stream should receive it.""" - listener = SSELineListener(self._stream_url(), self.galaxy_interactor.api_key) + api_key = self.galaxy_interactor.api_key + assert api_key is not None + listener = SSELineListener(self._stream_url(), api_key) listener.start() try: subject = f"broadcast_sse_test_{uuid4()}" diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index 70ad646c0caa..715898bbd2eb 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -94,7 +94,7 @@ def test_notification_appears_via_sse(self): # the UI still shows the notification, polling picked it up — a silent # regression this assertion catches. 
self._wait_for_sse_event_after(baseline_ts) - self.driver.wait_for_selector_visible(f"text={subject}", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) + self.wait_for_selector_visible(f"text={subject}", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) self.screenshot("notification_sse_after") @selenium_test @@ -129,7 +129,7 @@ def test_notification_bell_updates_via_sse(self): self._wait_for_sse_event_after(baseline_ts) # The indicator dot should appear on the bell (within the #activity-notifications element) - self.driver.wait_for_selector_visible( + self.wait_for_selector_visible( "#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000 ) self.screenshot("notification_bell_indicator") diff --git a/test/unit/authnz/test_psa_authnz.py b/test/unit/authnz/test_psa_authnz.py index 6bd2429d0ebb..994d4aea12e1 100644 --- a/test/unit/authnz/test_psa_authnz.py +++ b/test/unit/authnz/test_psa_authnz.py @@ -9,10 +9,7 @@ ) from types import SimpleNamespace from typing import Optional -from unittest.mock import ( - MagicMock, - patch, -) +from unittest.mock import MagicMock import jwt import pytest @@ -400,15 +397,16 @@ def test_oidc_config_custom_auth_pipeline_and_extra(mock_oidc_config_file, mock_ def test_sync_user_profile_skips_when_account_interface_enabled(): manager = MagicMock() session = MagicMock() + notify = MagicMock() app_config = SimpleNamespace(enable_account_interface=True, enable_notification_system=True) - app = SimpleNamespace(config=app_config, user_manager=manager, notification_manager=SimpleNamespace()) + notification_manager = SimpleNamespace(send_notification_internal=notify) + app = SimpleNamespace(config=app_config, user_manager=manager, notification_manager=notification_manager) trans = SimpleNamespace(app=app, sa_session=session) strategy = SimpleNamespace(config={"GALAXY_TRANS": trans, "FIXED_DELEGATED_AUTH": True}) user = SimpleNamespace(id=1, preferences={}) details = {"email": "new@example.com", "username": "newname"} - with 
patch("galaxy.webapps.galaxy.services.notifications.NotificationService.send_notification_internal") as notify: - sync_user_profile(strategy=strategy, details=details, user=user) + sync_user_profile(strategy=strategy, details=details, user=user) manager.update_email.assert_not_called() manager.update_username.assert_not_called() @@ -419,15 +417,16 @@ def test_sync_user_profile_skips_when_account_interface_enabled(): def test_sync_user_profile_skips_when_fixed_delegated_auth_disabled(): manager = MagicMock() session = MagicMock() + notify = MagicMock() app_config = SimpleNamespace(enable_account_interface=False, enable_notification_system=True) - app = SimpleNamespace(config=app_config, user_manager=manager, notification_manager=SimpleNamespace()) + notification_manager = SimpleNamespace(send_notification_internal=notify) + app = SimpleNamespace(config=app_config, user_manager=manager, notification_manager=notification_manager) trans = SimpleNamespace(app=app, sa_session=session) strategy = SimpleNamespace(config={"GALAXY_TRANS": trans, "FIXED_DELEGATED_AUTH": False}) user = SimpleNamespace(id=2, email="old@example.com", username="oldname", preferences={}) details = {"email": "new@example.com", "username": "newname"} - with patch("galaxy.webapps.galaxy.services.notifications.NotificationService.send_notification_internal") as notify: - sync_user_profile(strategy=strategy, details=details, user=user) + sync_user_profile(strategy=strategy, details=details, user=user) manager.update_email.assert_not_called() manager.update_username.assert_not_called() @@ -438,16 +437,16 @@ def test_sync_user_profile_skips_when_fixed_delegated_auth_disabled(): def test_sync_user_profile_updates_when_account_interface_disabled(): manager = MagicMock() session = MagicMock() + notify = MagicMock() app_config = SimpleNamespace(enable_account_interface=False, enable_notification_system=True) - notification_manager = SimpleNamespace(notifications_enabled=True) + notification_manager = 
SimpleNamespace(notifications_enabled=True, send_notification_internal=notify) app = SimpleNamespace(config=app_config, user_manager=manager, notification_manager=notification_manager) trans = SimpleNamespace(app=app, sa_session=session) strategy = SimpleNamespace(config={"GALAXY_TRANS": trans, "FIXED_DELEGATED_AUTH": True}) user = SimpleNamespace(id=2, email="old@example.com", username="oldname", preferences={}) details = {"email": "new@example.com", "username": "newname"} - with patch("galaxy.webapps.galaxy.services.notifications.NotificationService.send_notification_internal") as notify: - sync_user_profile(strategy=strategy, details=details, user=user) + sync_user_profile(strategy=strategy, details=details, user=user) manager.update_email.assert_called_once_with( trans, user, "new@example.com", commit=False, send_activation_email=False From 6e5eada9441285f309aa4b71e0ba9ca10a2c81d7 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 20 Apr 2026 14:35:21 +0200 Subject: [PATCH 13/47] Add SSE entry-point channel, dispatch observability, declare-queue cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three related additions to the SSE notification pipeline, bundled here because they share ``SSEEventDispatcher._send`` as their modification point: 1. Interactive-tool entry-point SSE channel - ``entry_point_update`` dispatcher method + queue-worker handler. - ``InteractiveToolManager.configure_entry_points`` dispatches a wake-up event after the DB commit; the client refetches ``/api/entry_points`` on receipt (no payload). - Frontend: new SSE event type, store subscription with XOR polling fallback via ``enable_sse_entry_point_updates`` config flag. - Integration + selenium tests. 2. Queue and SSE observability metrics - Counters, timers, and periodic gauges for SSE dispatch, control-queue task execution, control-queue depth (via kombu passive declare), active SSE connections, dropped events, and active WorkerProcess rows. 
Flow through the existing ``galaxy_statsd_client`` — no new infra. Gauges are scheduled via Celery beat at ``queue_metrics_interval`` seconds (default 15). All instrumentation no-ops when statsd isn't configured. - Sub-emitter failures log once at WARNING and bump a ``galaxy.queue_metrics.error`` counter tagged by emitter so broken emitters are visible in metrics without log spam. 3. Active-worker control-queue cache - 30 s TTL cache on ``all_control_queues_for_declare`` with RLock stampede protection. At 1000+ events/s this eliminates ~30 DB round-trips/s per webapp for data that only changes on 60 s heartbeat cadence. Empty results are not cached — would otherwise silently drop every SSE event during the startup window. --- client/src/composables/useNotificationSSE.ts | 1 + client/src/stores/_testing/sseStoreSupport.ts | 8 +- client/src/stores/entryPointStore.test.js | 15 +- client/src/stores/entryPointStore.ts | 97 +++++-- doc/source/admin/galaxy_options.rst | 26 ++ lib/galaxy/app/__init__.py | 8 +- lib/galaxy/app_unittest_utils/galaxy_mock.py | 3 +- lib/galaxy/authnz/psa_authnz.py | 1 + lib/galaxy/celery/__init__.py | 3 + lib/galaxy/celery/tasks.py | 13 +- lib/galaxy/config/sample/galaxy.yml.sample | 12 + lib/galaxy/config/schemas/config_schema.yml | 18 ++ lib/galaxy/managers/interactivetool.py | 25 +- lib/galaxy/managers/notification.py | 2 +- lib/galaxy/managers/sse.py | 23 +- lib/galaxy/managers/sse_dispatch.py | 75 +++++- lib/galaxy/model/unittest_utils/data_app.py | 6 + lib/galaxy/queue_worker/__init__.py | 92 ++++++- lib/galaxy/structured_app/__init__.py | 1 + lib/galaxy/webapps/galaxy/api/tool_data.py | 2 +- lib/galaxy/webapps/galaxy/metrics/__init__.py | 0 .../webapps/galaxy/metrics/queue_metrics.py | 147 +++++++++++ lib/galaxy/webapps/galaxy/services/base.py | 1 - .../webapps/galaxy/services/datasets.py | 6 +- .../webapps/galaxy/services/histories.py | 2 +- .../galaxy/services/history_contents.py | 2 +- .../webapps/galaxy/services/invocations.py | 2 +- 
lib/galaxy/webapps/galaxy/services/jobs.py | 2 +- lib/galaxy/webapps/galaxy/services/pages.py | 2 +- lib/galaxy/webapps/galaxy/services/users.py | 6 +- test/integration/test_entry_point_sse.py | 158 ++++++++++++ .../test_entry_point_sse.py | 107 ++++++++ .../test_notification_sse.py | 4 +- test/unit/app/managers/test_queue_metrics.py | 243 ++++++++++++++++++ test/unit/app/managers/test_sse_dispatch.py | 226 ++++++++++++++++ .../app/managers/test_sse_dispatch_cache.py | 140 ++++++++++ .../app/queue_worker/test_queue_worker.py | 111 ++++++++ 37 files changed, 1528 insertions(+), 62 deletions(-) create mode 100644 lib/galaxy/webapps/galaxy/metrics/__init__.py create mode 100644 lib/galaxy/webapps/galaxy/metrics/queue_metrics.py create mode 100644 test/integration/test_entry_point_sse.py create mode 100644 test/integration_selenium/test_entry_point_sse.py create mode 100644 test/unit/app/managers/test_queue_metrics.py create mode 100644 test/unit/app/managers/test_sse_dispatch.py create mode 100644 test/unit/app/managers/test_sse_dispatch_cache.py diff --git a/client/src/composables/useNotificationSSE.ts b/client/src/composables/useNotificationSSE.ts index b0d34d8e8a5e..944af5fb8d6b 100644 --- a/client/src/composables/useNotificationSSE.ts +++ b/client/src/composables/useNotificationSSE.ts @@ -10,6 +10,7 @@ export const SSE_EVENT_TYPES = [ "broadcast_update", "notification_status", "history_update", + "entry_point_update", ] as const; export type SSEEventType = (typeof SSE_EVENT_TYPES)[number]; diff --git a/client/src/stores/_testing/sseStoreSupport.ts b/client/src/stores/_testing/sseStoreSupport.ts index 4613430bb254..b3e5d97fcb89 100644 --- a/client/src/stores/_testing/sseStoreSupport.ts +++ b/client/src/stores/_testing/sseStoreSupport.ts @@ -20,14 +20,20 @@ export interface SSEMockState { onEvent: ((event: MessageEvent) => void) | null; connect: ReturnType; disconnect: ReturnType; + connected?: Ref; } /** Build the factory used with 
``vi.mock("@/composables/useNotificationSSE", ...)``. */ export function sseMockFactory(state: SSEMockState) { + // Lazily initialize ``connected`` so existing callers that don't pass it + // still get a working ref. + if (!state.connected) { + state.connected = ref(false); + } return { useSSE: vi.fn((onEvent: (event: MessageEvent) => void) => { state.onEvent = onEvent; - return { connect: state.connect, disconnect: state.disconnect }; + return { connect: state.connect, disconnect: state.disconnect, connected: state.connected }; }), }; } diff --git a/client/src/stores/entryPointStore.test.js b/client/src/stores/entryPointStore.test.js index d0c6b69b4573..fdae2799d306 100644 --- a/client/src/stores/entryPointStore.test.js +++ b/client/src/stores/entryPointStore.test.js @@ -1,12 +1,25 @@ import flushPromises from "flush-promises"; import { createPinia, setActivePinia } from "pinia"; -import { beforeEach, describe, expect, it } from "vitest"; +import { beforeEach, describe, expect, it, vi } from "vitest"; import { HttpResponse, useServerMock } from "@/api/client/__mocks__"; import testInteractiveToolsResponse from "../components/InteractiveTools/testData/testInteractiveToolsResponse"; +import { sseMockFactory } from "./_testing/sseStoreSupport"; import { useEntryPointStore } from "./entryPointStore"; +// ``vi.mock`` is hoisted above module-level declarations, so the capture-state +// has to be built via ``vi.hoisted`` to be visible to the factory. Prevents +// these tests from opening a real EventSource against ``/api/events/stream`` +// when ``useEntryPointStore()`` is invoked. 
+const sseState = vi.hoisted(() => ({ + onEvent: null, + connect: vi.fn(), + disconnect: vi.fn(), + connected: null, +})); +vi.mock("@/composables/useNotificationSSE", () => sseMockFactory(sseState)); + const { server, http } = useServerMock(); describe("stores/EntryPointStore", () => { diff --git a/client/src/stores/entryPointStore.ts b/client/src/stores/entryPointStore.ts index 805124a713d5..68193406967f 100644 --- a/client/src/stores/entryPointStore.ts +++ b/client/src/stores/entryPointStore.ts @@ -1,10 +1,12 @@ import axios from "axios"; import isEqual from "lodash.isequal"; import { defineStore } from "pinia"; -import { computed, ref } from "vue"; +import { computed, ref, watch } from "vue"; import { useResourceWatcher } from "@/composables/resourceWatcher"; +import { useSSE } from "@/composables/useNotificationSSE"; import { getAppRoot } from "@/onload/loadConfig"; +import { useConfigStore } from "@/stores/configurationStore"; import { rethrowSimple } from "@/utils/simple-error"; const ACTIVE_POLLING_INTERVAL = 10000; @@ -23,23 +25,8 @@ interface EntryPoint { } export const useEntryPointStore = defineStore("entryPointStore", () => { - const { startWatchingResource: startWatchingEntryPoints, stopWatchingResource: stopWatchingEntryPoints } = - useResourceWatcher(fetchEntryPoints, { - shortPollingInterval: ACTIVE_POLLING_INTERVAL, - enableBackgroundPolling: false, // No need to poll in the background - }); - const entryPoints = ref([]); - const entryPointsForJob = computed(() => { - return (jobId: string) => entryPoints.value.filter((entryPoint) => entryPoint["job_id"] === jobId); - }); - - const entryPointsForHda = computed(() => { - return (hdaId: string) => - entryPoints.value.filter((entryPoint) => entryPoint["output_datasets_ids"].includes(hdaId)); - }); - async function fetchEntryPoints() { const url = `${getAppRoot()}api/entry_points`; const params = { running: true }; @@ -51,6 +38,79 @@ export const useEntryPointStore = defineStore("entryPointStore", () 
=> { } } + // SSE-driven path: on each entry_point_update signal, refetch the canonical + // list from REST. The event carries no data — it's a pure wake-up. + function handleEntryPointSSEEvent(_event: MessageEvent) { + fetchEntryPoints().catch((err) => console.error("Error refreshing entry points from SSE push:", err)); + } + const { + connect: sseConnect, + disconnect: sseDisconnect, + connected: sseConnected, + } = useSSE(handleEntryPointSSEEvent, ["entry_point_update"]); + + let watchingInitialized = false; + let stopWatchingEntryPointsResource: (() => void) | null = null; + + // Callers opt in via ``startWatchingEntryPoints()`` (App.vue gates this on + // ``interactivetools_enable``). We then pick SSE or polling based on the + // server flag — mutually exclusive, mirroring historyStore / notificationsStore. + // ``useConfigStore`` is resolved lazily here so tests that only exercise + // the data methods don't need a ``/api/configuration`` handler registered. + function startWatchingEntryPoints() { + if (watchingInitialized) { + return; + } + watchingInitialized = true; + const configStore = useConfigStore(); + + const decide = () => { + if (configStore.config?.enable_sse_entry_point_updates) { + // Baseline fetch + SSE. Reconnect-refetch closes the "user + // navigated away and missed events" window. 
+ fetchEntryPoints().catch((err) => console.warn("Initial entry-point load failed", err)); + sseConnect(); + watch(sseConnected, (isConnected, wasConnected) => { + if (isConnected && !wasConnected) { + fetchEntryPoints().catch((err) => + console.error("Error refreshing entry points on SSE reconnect:", err), + ); + } + }); + } else { + const { startWatchingResource, stopWatchingResource } = useResourceWatcher(fetchEntryPoints, { + shortPollingInterval: ACTIVE_POLLING_INTERVAL, + enableBackgroundPolling: false, + }); + stopWatchingEntryPointsResource = stopWatchingResource; + startWatchingResource(); + } + }; + + if (configStore.isLoaded) { + decide(); + } else { + const stop = watch( + () => configStore.isLoaded, + (loaded) => { + if (loaded) { + stop(); + decide(); + } + }, + ); + } + } + + const entryPointsForJob = computed(() => { + return (jobId: string) => entryPoints.value.filter((entryPoint) => entryPoint["job_id"] === jobId); + }); + + const entryPointsForHda = computed(() => { + return (hdaId: string) => + entryPoints.value.filter((entryPoint) => entryPoint["output_datasets_ids"].includes(hdaId)); + }); + function updateEntryPoints(data: EntryPoint[]) { let hasChanged = entryPoints.value.length !== data.length ? true : false; if (entryPoints.value.length === 0) { @@ -76,6 +136,11 @@ export const useEntryPointStore = defineStore("entryPointStore", () => { return { ...original, ...updated }; } + function stopWatchingEntryPoints() { + sseDisconnect(); + stopWatchingEntryPointsResource?.(); + } + function removeEntryPoint(toolId: string) { const index = entryPoints.value.findIndex((ep) => { return ep.id === toolId ? 
true : false; diff --git a/doc/source/admin/galaxy_options.rst b/doc/source/admin/galaxy_options.rst index da52bf02d820..e87d6ac0ef40 100644 --- a/doc/source/admin/galaxy_options.rst +++ b/doc/source/admin/galaxy_options.rst @@ -3408,6 +3408,18 @@ :Type: bool +~~~~~~~~~~~~~~~~~~~~~~~~~~ +``queue_metrics_interval`` +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + How often (in seconds) the Celery beat task emits queue-depth, + SSE-connection, and WorkerProcess gauges. Only active when + statsd_host is set. Set to 0 to disable. +:Default: ``15`` +:Type: int + + ~~~~~~~~~~~~~~~~~~~~~~ ``library_import_dir`` ~~~~~~~~~~~~~~~~~~~~~~ @@ -5818,6 +5830,20 @@ :Type: bool +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``enable_sse_entry_point_updates`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + Enables real-time interactive-tool entry-point update + notifications via Server-Sent Events. When enabled, the client + subscribes to entry_point_update SSE events and refetches the + entry-point list on each event, replacing the 10-second polling + loop. When disabled, polling remains the source of updates. +:Default: ``false`` +:Type: bool + + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``history_audit_monitor_poll_interval`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/lib/galaxy/app/__init__.py b/lib/galaxy/app/__init__.py index 991944962cf0..28ed51e4c3a3 100644 --- a/lib/galaxy/app/__init__.py +++ b/lib/galaxy/app/__init__.py @@ -695,6 +695,7 @@ def __init__( SSEEventDispatcher( queue_worker=getattr(self, "queue_worker", None), application_stack=self.application_stack, + statsd_client=self.execution_timer_factory.galaxy_statsd_client, ), ) self.notification_manager = self._register_singleton(NotificationManager) @@ -858,7 +859,12 @@ def __init__(self, **kwargs) -> None: # SSE connection manager for real-time notification push. 
# Consumed via ``depends(SSEConnectionManager)`` / ``app[SSEConnectionManager]``, # so no module-level attribute is needed — keep the container wiring only. - self._register_singleton(SSEConnectionManager) + self._register_singleton( + SSEConnectionManager, + SSEConnectionManager( + statsd_client=self.execution_timer_factory.galaxy_statsd_client, + ), + ) # AI agent registry and service agent_registry = build_agent_registry(self.config) diff --git a/lib/galaxy/app_unittest_utils/galaxy_mock.py b/lib/galaxy/app_unittest_utils/galaxy_mock.py index 82a0b7886655..9f4fe40139e4 100644 --- a/lib/galaxy/app_unittest_utils/galaxy_mock.py +++ b/lib/galaxy/app_unittest_utils/galaxy_mock.py @@ -119,6 +119,7 @@ class MockApp(di.Container, GalaxyDataTestApp): history_manager: HistoryManager job_metrics: JobMetrics vault: Optional[Vault] = None + execution_timer_factory: Any stop: bool is_webapp: bool = True @@ -159,7 +160,7 @@ def __init__(self, config=None, **kwargs) -> None: self.application_stack = ApplicationStack() self.auth_manager = AuthManager(self.config) self.user_manager = UserManager(cast(BasicSharedApp, self)) - self.execution_timer_factory = Bunch(get_timer=StructuredExecutionTimer) + self.execution_timer_factory = Bunch(get_timer=StructuredExecutionTimer, galaxy_statsd_client=None) self.interactivetool_manager = Bunch(create_interactivetool=lambda *args, **kwargs: None) self.is_job_handler = False self.biotools_metadata_source = None diff --git a/lib/galaxy/authnz/psa_authnz.py b/lib/galaxy/authnz/psa_authnz.py index 75247adc4ce1..022272e3d9de 100644 --- a/lib/galaxy/authnz/psa_authnz.py +++ b/lib/galaxy/authnz/psa_authnz.py @@ -798,6 +798,7 @@ def _send_oidc_profile_update_notification(trans, user, updates: list[str]) -> N NotificationVariant, PersonalNotificationCategory, ) + labels: dict[str, str] = { "email": "email address", "username": "public name", diff --git a/lib/galaxy/celery/__init__.py b/lib/galaxy/celery/__init__.py index 67917cca6580..dce38961f3cc 
100644 --- a/lib/galaxy/celery/__init__.py +++ b/lib/galaxy/celery/__init__.py @@ -252,6 +252,9 @@ def schedule_task(task, interval): schedule_task("prune_history_audit_table", config.history_audit_table_prune_interval) schedule_task("cleanup_short_term_storage", config.short_term_storage_cleanup_interval) + if config.statsd_host: + schedule_task("emit_queue_metrics_task", config.queue_metrics_interval) + if config.enable_notification_system: schedule_task("cleanup_expired_notifications", config.expired_notifications_cleanup_interval) schedule_task("dispatch_pending_notifications", config.dispatch_notifications_interval) diff --git a/lib/galaxy/celery/tasks.py b/lib/galaxy/celery/tasks.py index b630fdce26a7..eef0c951bbbf 100644 --- a/lib/galaxy/celery/tasks.py +++ b/lib/galaxy/celery/tasks.py @@ -76,7 +76,10 @@ Vault, ) from galaxy.short_term_storage import ShortTermStorageMonitor -from galaxy.structured_app import MinimalManagerApp +from galaxy.structured_app import ( + MinimalManagerApp, + StructuredApp, +) from galaxy.tools import create_tool_from_representation from galaxy.tools.data_fetch import do_fetch from galaxy.util import galaxy_directory @@ -628,6 +631,14 @@ def dispatch_pending_notifications(notification_manager: NotificationManager): log.info(f"Successfully dispatched {count} notifications.") +@galaxy_task(action="emit queue and SSE observability metrics") +def emit_queue_metrics_task(app: StructuredApp): + """Sample control-queue depth, SSE connection count, and worker rows → statsd.""" + from galaxy.webapps.galaxy.metrics.queue_metrics import emit_queue_metrics + + emit_queue_metrics(app) + + @galaxy_task(action="clean up job working directories") def cleanup_jwds(sa_session: galaxy_scoped_session, object_store: BaseObjectStore, config: GalaxyAppConfiguration): """Cleanup job working directories for failed jobs that are older than X days""" diff --git a/lib/galaxy/config/sample/galaxy.yml.sample b/lib/galaxy/config/sample/galaxy.yml.sample index 
34ae86e4b1bd..b4fe9d9556eb 100644 --- a/lib/galaxy/config/sample/galaxy.yml.sample +++ b/lib/galaxy/config/sample/galaxy.yml.sample @@ -1962,6 +1962,11 @@ galaxy: # really. Do not set this in production environments. #statsd_mock_calls: false + # How often (in seconds) the Celery beat task emits queue-depth, + # SSE-connection, and WorkerProcess gauges. Only active when + # statsd_host is set. Set to 0 to disable. + #queue_metrics_interval: 15 + # Add an option to the library upload form which allows administrators # to upload a directory of files. #library_import_dir: null @@ -3136,6 +3141,13 @@ galaxy: # browsers, replacing aggressive 3-second polling. #enable_sse_history_updates: false + # Enables real-time interactive-tool entry-point update notifications + # via Server-Sent Events. When enabled, the client subscribes to + # entry_point_update SSE events and refetches the entry-point list on + # each event, replacing the 10-second polling loop. When disabled, + # polling remains the source of updates. + #enable_sse_entry_point_updates: false + # The interval in seconds between history audit table polls when using # the polling fallback (SQLite or when PostgreSQL LISTEN/NOTIFY is # unavailable). Only used when enable_sse_history_updates is true. diff --git a/lib/galaxy/config/schemas/config_schema.yml b/lib/galaxy/config/schemas/config_schema.yml index aba88b87cee4..cbba5927ac7a 100644 --- a/lib/galaxy/config/schemas/config_schema.yml +++ b/lib/galaxy/config/schemas/config_schema.yml @@ -2512,6 +2512,14 @@ mapping: Mock out statsd client calls - only used by testing infrastructure really. Do not set this in production environments. + queue_metrics_interval: + type: int + default: 15 + required: false + desc: | + How often (in seconds) the Celery beat task emits queue-depth, SSE-connection, + and WorkerProcess gauges. Only active when statsd_host is set. Set to 0 to disable. 
+ library_import_dir: type: str required: false @@ -4303,6 +4311,16 @@ mapping: LISTEN/NOTIFY or audit table polling as a fallback for SQLite) and pushes update signals to connected browsers, replacing aggressive 3-second polling. + enable_sse_entry_point_updates: + type: bool + default: false + required: false + desc: | + Enables real-time interactive-tool entry-point update notifications via + Server-Sent Events. When enabled, the client subscribes to entry_point_update + SSE events and refetches the entry-point list on each event, replacing the + 10-second polling loop. When disabled, polling remains the source of updates. + history_audit_monitor_poll_interval: type: int default: 2 diff --git a/lib/galaxy/managers/interactivetool.py b/lib/galaxy/managers/interactivetool.py index 009d8f8dc36f..21b872f146ef 100644 --- a/lib/galaxy/managers/interactivetool.py +++ b/lib/galaxy/managers/interactivetool.py @@ -6,6 +6,7 @@ ) from typing import ( Any, + Optional, TYPE_CHECKING, Union, ) @@ -28,6 +29,7 @@ ) from galaxy import exceptions +from galaxy.managers.sse_dispatch import SSEEventDispatcher from galaxy.model import ( InteractiveToolEntryPoint, Job, @@ -147,7 +149,11 @@ class InteractiveToolManager: Manager for dealing with InteractiveTools """ - def __init__(self, app: "MinimalManagerApp") -> None: + def __init__( + self, + app: "MinimalManagerApp", + dispatcher: Optional[SSEEventDispatcher] = None, + ) -> None: self.app = app self.security = app.security self.sa_session = app.model.context @@ -157,6 +163,12 @@ def __init__(self, app: "MinimalManagerApp") -> None: app.config.interactivetoolsproxy_map or app.config.interactivetools_map, self.encoder.encode_id, ) + # Lagom can't auto-inject ``SSEEventDispatcher`` here because the + # ``app: "MinimalManagerApp"`` hint is only a forward reference + # (TYPE_CHECKING import), so ``get_type_hints`` on this signature + # fails. 
Resolve through the container explicitly — ``resolve_or_none`` + # returns ``None`` for mocks/test apps that never registered one. + self.dispatcher = dispatcher if dispatcher is not None else app.resolve_or_none(SSEEventDispatcher) def create_entry_points( self, job: Job, tool: "Tool", entry_points=Union[Iterable[dict[str, Any]], None], flush: bool = True @@ -198,6 +210,17 @@ def configure_entry_points( configured.append(ep) if configured: self.sa_session.commit() + # Fan out an SSE push so the user's browser can refresh the entry + # point list immediately instead of waiting for the 10 s poll. + # Anonymous jobs fall back to polling — ``push_to_user`` keys on + # user_id, and anonymous clients sit in the broadcast-only set. + if self.dispatcher is not None and job.user_id is not None: + try: + self.dispatcher.entry_point_update(user_id=job.user_id) + except Exception: + # The DB commit is authoritative; the SSE event is best + # effort. Never let a dispatch failure poison the caller. + log.exception("Failed to dispatch entry_point_update SSE event for job %s", job.id) return dict(not_configured=not_configured, configured=configured) def save_entry_point(self, entry_point: InteractiveToolEntryPoint) -> None: diff --git a/lib/galaxy/managers/notification.py b/lib/galaxy/managers/notification.py index 676660342531..18de7e795a59 100644 --- a/lib/galaxy/managers/notification.py +++ b/lib/galaxy/managers/notification.py @@ -59,8 +59,8 @@ NotificationBroadcastUpdateRequest, NotificationCategorySettings, NotificationChannelSettings, - NotificationCreatedResponse, NotificationCreateData, + NotificationCreatedResponse, NotificationCreateRequest, NotificationRecipients, NotificationResponse, diff --git a/lib/galaxy/managers/sse.py b/lib/galaxy/managers/sse.py index 2c83a549e081..66b54bb002d5 100644 --- a/lib/galaxy/managers/sse.py +++ b/lib/galaxy/managers/sse.py @@ -20,6 +20,7 @@ ) from galaxy.model.orm.now import now +from galaxy.web.statsd_client import 
VanillaGalaxyStatsdClient log = logging.getLogger(__name__) @@ -78,10 +79,11 @@ class SSEConnectionManager: (typically the Kombu daemon thread via control task handlers). """ - def __init__(self) -> None: + def __init__(self, statsd_client: Optional[VanillaGalaxyStatsdClient] = None) -> None: self._connections: dict[int, set[asyncio.Queue]] = defaultdict(set) self._broadcast_connections: set[asyncio.Queue] = set() self._loop: Optional[asyncio.AbstractEventLoop] = None + self._statsd_client = statsd_client def _ensure_loop(self) -> None: """Capture the running asyncio event loop. Must be called from async context.""" @@ -148,13 +150,14 @@ def _safe_put(self, queue: asyncio.Queue, event: SSEEvent) -> None: # Event loop is closed or shutting down pass - @staticmethod - def _do_put(queue: asyncio.Queue, event: SSEEvent) -> None: + def _do_put(self, queue: asyncio.Queue, event: SSEEvent) -> None: """Runs ON the event loop thread. Safe to touch asyncio.Queue here.""" try: queue.put_nowait(event) except asyncio.QueueFull: log.warning("SSE queue full, dropping event: %s", event.event) + if self._statsd_client is not None: + self._statsd_client.incr("galaxy.sse.connections.dropped") @property def connected_user_ids(self) -> set[int]: @@ -164,6 +167,20 @@ def connected_user_ids(self) -> set[int]: def total_connections(self) -> int: return len(self._broadcast_connections) + @property + def total_broadcast_connections(self) -> int: + """Number of all active SSE connections (includes anonymous). + + Every connection is added to ``_broadcast_connections``; this is the + most accurate "SSE clients currently connected" gauge. 
+ """ + return len(self._broadcast_connections) + + @property + def total_per_user_connections(self) -> int: + """Number of active SSE connections bound to a specific user_id.""" + return sum(len(queues) for queues in self._connections.values()) + # -- High-level streaming helper -- async def stream( diff --git a/lib/galaxy/managers/sse_dispatch.py b/lib/galaxy/managers/sse_dispatch.py index 8db0cb6e63aa..927cb1ab0d81 100644 --- a/lib/galaxy/managers/sse_dispatch.py +++ b/lib/galaxy/managers/sse_dispatch.py @@ -9,17 +9,24 @@ """ import logging +import threading +import time +from collections.abc import Callable from typing import ( Any, Optional, ) +from cachetools import TTLCache +from kombu import Queue + from galaxy.managers.sse import make_event_id from galaxy.queue_worker import ( ControlTask, GalaxyQueueWorker, ) from galaxy.queues import all_control_queues_for_declare +from galaxy.web.statsd_client import VanillaGalaxyStatsdClient from galaxy.web_stack import ApplicationStack log = logging.getLogger(__name__) @@ -32,31 +39,70 @@ class SSEEventDispatcher: without a full ``app``. ``queue_worker`` is ``Optional`` because unit-test mock apps and Galaxy configurations without AMQP don't construct one — the dispatcher silently no-ops in that case. + + ``statsd_client`` is optional — if ``None`` (statsd not configured), all + instrumentation becomes a cheap attribute-lookup no-op. """ + # TTL for the active-worker declare-queue cache. The WorkerProcess heartbeat + # writes every 60 s and ``all_control_queues_for_declare`` filters on a 120 s + # window, so a 30 s cache cannot produce a result that wasn't also valid in + # the non-cached call. Surfaced as a class constant so tests can monkey-patch. 
+ _DECLARE_QUEUES_TTL_SECONDS = 30 + def __init__( self, queue_worker: Optional[GalaxyQueueWorker], application_stack: ApplicationStack, + statsd_client: Optional[VanillaGalaxyStatsdClient] = None, + clock: Callable[[], float] = time.monotonic, ) -> None: self._queue_worker = queue_worker self._application_stack = application_stack + self._statsd_client = statsd_client + self._clock = clock + self._declare_queues_cache: TTLCache = TTLCache(maxsize=1, ttl=self._DECLARE_QUEUES_TTL_SECONDS, timer=clock) + self._declare_queues_lock = threading.RLock() + + def _get_declare_queues(self) -> list[Queue]: + # Empty results (startup before DatabaseHeartbeat registers this process, + # or a transient DB error swallowed by ``all_control_queues_for_declare``) + # must not be pinned for the full TTL — they'd silently drop every SSE + # event until the next expiry. Only cache non-empty results. + with self._declare_queues_lock: + try: + return self._declare_queues_cache["webapp"] + except KeyError: + queues = all_control_queues_for_declare(self._application_stack, webapp_only=True) + if queues: + self._declare_queues_cache["webapp"] = queues + return queues def _send(self, task: str, kwargs: dict[str, Any]) -> None: if self._queue_worker is None: # AMQP not configured at all (e.g. unit-test mock app). Skip silently. log.debug("SSE dispatch skipped: no queue_worker configured (task=%s)", task) + if self._statsd_client is not None: + self._statsd_client.incr("galaxy.sse.dispatch.skipped_no_qw") return + if self._statsd_client is not None: + self._statsd_client.incr("galaxy.sse.dispatch.count", tags={"task": task}) # Only fan out to webapp processes — job handlers and workflow schedulers # don't have browser SSE connections to push to. 
- declare_queues = all_control_queues_for_declare(self._application_stack, webapp_only=True) + declare_queues = self._get_declare_queues() control_task = ControlTask(self._queue_worker) - control_task.send_task( - payload={"task": task, "kwargs": kwargs}, - routing_key="control.*", - expiration=10, - declare_queues=declare_queues, - ) + start_time = time.perf_counter() if self._statsd_client is not None else 0.0 + try: + control_task.send_task( + payload={"task": task, "kwargs": kwargs}, + routing_key="control.*", + expiration=10, + declare_queues=declare_queues, + ) + finally: + if self._statsd_client is not None: + dt_ms = int((time.perf_counter() - start_time) * 1000) + self._statsd_client.timing("galaxy.sse.dispatch.latency_ms", dt_ms, tags={"task": task}) def notify_users(self, user_ids: list[int], payload: str, event_id: Optional[str] = None) -> None: self._send( @@ -85,3 +131,18 @@ def history_update(self, user_updates: dict[str, list[int]], event_id: Optional[ "event_id": event_id or make_event_id(), }, ) + + def entry_point_update(self, user_id: int, event_id: Optional[str] = None) -> None: + """Fan out a wake-up ``entry_point_update`` event for one user. + + The client always refetches the canonical entry-point list on receipt, + so no IDs are sent — keeping the payload small and the dispatch path + free of per-event encoding work. 
+ """ + self._send( + "entry_point_update", + { + "user_id": user_id, + "event_id": event_id or make_event_id(), + }, + ) diff --git a/lib/galaxy/model/unittest_utils/data_app.py b/lib/galaxy/model/unittest_utils/data_app.py index 0d424cca3623..2eb7e78bf3e7 100644 --- a/lib/galaxy/model/unittest_utils/data_app.py +++ b/lib/galaxy/model/unittest_utils/data_app.py @@ -9,6 +9,7 @@ import os import shutil import tempfile +from types import SimpleNamespace from typing import Optional from galaxy import ( @@ -105,6 +106,11 @@ def __init__(self, config: Optional[GalaxyDataTestConfig] = None, **kwd): model.setup_global_object_store_for_models(self.object_store) self.security_agent = self.model.security_agent self.tag_handler = GalaxyTagHandler(self.model.session) + # statsd/observability plumbing — stubbed out so paths that read + # ``app.execution_timer_factory.galaxy_statsd_client`` (e.g. the + # queue-worker instrumentation) degrade to a no-op if ever exercised + # against this mock instead of raising ``AttributeError``. 
+ self.execution_timer_factory = SimpleNamespace(galaxy_statsd_client=None) self.init_datatypes() def init_datatypes(self): diff --git a/lib/galaxy/queue_worker/__init__.py b/lib/galaxy/queue_worker/__init__.py index 04824d8a8498..a01fc27efc91 100644 --- a/lib/galaxy/queue_worker/__init__.py +++ b/lib/galaxy/queue_worker/__init__.py @@ -13,8 +13,10 @@ import time from inspect import ismodule from typing import ( + cast, Optional, TYPE_CHECKING, + TypedDict, ) from kombu import ( @@ -48,6 +50,39 @@ ) +class NotifyUsersPayload(TypedDict, total=False): + """Wire contract for the ``notify_users`` control-task kwargs.""" + + user_ids: list[int] + payload: str + event_id: Optional[str] + + +class NotifyBroadcastPayload(TypedDict, total=False): + """Wire contract for the ``notify_broadcast`` control-task kwargs.""" + + payload: str + event_id: Optional[str] + + +class HistoryUpdatePayload(TypedDict, total=False): + """Wire contract for the ``history_update`` control-task kwargs. + + ``user_updates`` maps stringified user IDs to lists of (unencoded) history IDs. + Stringified because AMQP JSON serialization coerces dict keys to strings. 
+ """ + + user_updates: dict[str, list[int]] + event_id: Optional[str] + + +class EntryPointUpdatePayload(TypedDict, total=False): + """Wire contract for the ``entry_point_update`` control-task kwargs.""" + + user_id: int + event_id: Optional[str] + + def send_local_control_task( app: "StructuredApp", task: str, @@ -361,21 +396,26 @@ def admin_job_lock(app, **kwargs): def notify_users(app: "MinimalManagerApp", **kwargs) -> None: """Push SSE events to connected users on this worker process.""" + payload = cast(NotifyUsersPayload, kwargs) sse_manager = app[SSEConnectionManager] - user_ids = kwargs.get("user_ids", []) - payload = kwargs.get("payload", "{}") - event_id = kwargs.get("event_id") - event = SSEEvent(event="notification_update", data=payload, id=event_id) - for user_id in user_ids: + event = SSEEvent( + event="notification_update", + data=payload.get("payload", "{}"), + id=payload.get("event_id"), + ) + for user_id in payload.get("user_ids", []): sse_manager.push_to_user(user_id, event) def notify_broadcast(app: "MinimalManagerApp", **kwargs) -> None: """Push SSE broadcast events to all connected clients on this worker process.""" + payload = cast(NotifyBroadcastPayload, kwargs) sse_manager = app[SSEConnectionManager] - payload = kwargs.get("payload", "{}") - event_id = kwargs.get("event_id") - event = SSEEvent(event="broadcast_update", data=payload, id=event_id) + event = SSEEvent( + event="broadcast_update", + data=payload.get("payload", "{}"), + id=payload.get("event_id"), + ) sse_manager.push_broadcast(event) @@ -385,11 +425,11 @@ def history_update(app: "MinimalManagerApp", **kwargs) -> None: Encodes integer history IDs here (not in the monitor) so the manager layer stays free of presentation/security concerns. 
""" + payload = cast(HistoryUpdatePayload, kwargs) sse_manager = app[SSEConnectionManager] - user_updates = kwargs.get("user_updates", {}) - event_id = kwargs.get("event_id") + event_id = payload.get("event_id") encode = app.security.encode_id - for user_id_str, history_ids in user_updates.items(): + for user_id_str, history_ids in payload.get("user_updates", {}).items(): user_id = int(user_id_str) encoded_ids = [encode(hid) for hid in history_ids] data = json.dumps({"history_ids": encoded_ids}) @@ -397,6 +437,20 @@ def history_update(app: "MinimalManagerApp", **kwargs) -> None: sse_manager.push_to_user(user_id, event) +def entry_point_update(app: "MinimalManagerApp", **kwargs) -> None: + """Push a wake-up SSE event to a single connected user. + + The payload is empty by design: the client refetches ``/api/entry_points`` + (the canonical source) on receipt, so there's nothing to narrow or merge. + Dropping the IDs from the payload also avoids per-event ``encode_id`` work + at 1000+ events/s. 
+ """ + payload = cast(EntryPointUpdatePayload, kwargs) + sse_manager = app[SSEConnectionManager] + event = SSEEvent(event="entry_point_update", data="{}", id=payload.get("event_id")) + sse_manager.push_to_user(int(payload["user_id"]), event) + + control_message_to_task = { "create_panel_section": create_panel_section, "reload_tool": reload_tool, @@ -415,6 +469,7 @@ def history_update(app: "MinimalManagerApp", **kwargs) -> None: "notify_users": notify_users, "notify_broadcast": notify_broadcast, "history_update": history_update, + "entry_point_update": entry_point_update, } @@ -504,8 +559,14 @@ def get_consumers(self, Consumer, channel): def process_task(self, body, message): result = "NO_RESULT" + task_name = body.get("task") + statsd_client = self.app.execution_timer_factory.galaxy_statsd_client + if statsd_client is not None and task_name is not None: + statsd_client.incr("galaxy.control_queue.task.count", tags={"task": task_name}) if body["task"] in self.task_mapping: if body.get("noop", None) != self.app.config.server_name: + outcome = "ok" + handler_start = time.perf_counter() if statsd_client is not None else 0.0 try: f = self.task_mapping[body["task"]] if message.headers.get("epoch", math.inf) > self.epoch: @@ -526,7 +587,16 @@ def process_task(self, body, message): result = "NO_OP" except Exception: # this shouldn't ever throw an exception, but... 
+ outcome = "error" log.exception("Error running control task type: %s", body["task"]) + finally: + if statsd_client is not None and task_name is not None: + dt_ms = int((time.perf_counter() - handler_start) * 1000) + statsd_client.timing( + "galaxy.control_queue.task.latency_ms", + dt_ms, + tags={"task": task_name, "outcome": outcome}, + ) else: result = "NO_OP" else: diff --git a/lib/galaxy/structured_app/__init__.py b/lib/galaxy/structured_app/__init__.py index af78cf03d31c..fc342739f5fe 100644 --- a/lib/galaxy/structured_app/__init__.py +++ b/lib/galaxy/structured_app/__init__.py @@ -176,6 +176,7 @@ class StructuredApp(MinimalManagerApp): vault: Vault webhooks_registry: WebhooksRegistry queue_worker: Any # 'galaxy.queue_worker.GalaxyQueueWorker' + execution_timer_factory: Any # 'galaxy.app.ExecutionTimerFactory' data_provider_registry: Any # 'galaxy.visualization.data_providers.registry.DataProviderRegistry' tool_cache: "ToolCache" tool_shed_repository_cache: Optional[ToolShedRepositoryCache] diff --git a/lib/galaxy/webapps/galaxy/api/tool_data.py b/lib/galaxy/webapps/galaxy/api/tool_data.py index c7ae44f45aec..3d1f1a9abb8f 100644 --- a/lib/galaxy/webapps/galaxy/api/tool_data.py +++ b/lib/galaxy/webapps/galaxy/api/tool_data.py @@ -11,6 +11,7 @@ Field, ) +from galaxy.celery.helpers import async_task_summary from galaxy.celery.tasks import import_data_bundle from galaxy.managers.context import ProvidesUserContext from galaxy.managers.tool_data import ToolDataManager @@ -25,7 +26,6 @@ ToolDataItem, ) from galaxy.webapps.base.api import GalaxyFileResponse -from galaxy.webapps.galaxy.services.base import async_task_summary from . 
import ( depends, DependsOnTrans, diff --git a/lib/galaxy/webapps/galaxy/metrics/__init__.py b/lib/galaxy/webapps/galaxy/metrics/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/lib/galaxy/webapps/galaxy/metrics/queue_metrics.py b/lib/galaxy/webapps/galaxy/metrics/queue_metrics.py new file mode 100644 index 000000000000..c45dcfd71105 --- /dev/null +++ b/lib/galaxy/webapps/galaxy/metrics/queue_metrics.py @@ -0,0 +1,147 @@ +"""Periodic gauge emitter for control-queue depth and SSE-connection counts. + +Scheduled by Celery beat (see ``galaxy.celery.__init__.setup_periodic_tasks``) +at a fixed cadence. Opens a short-lived kombu connection, iterates the control +queues returned by ``all_control_queues_for_declare`` and samples each queue's +message-count via a passive declare. Also samples in-memory connection counts +from ``SSEConnectionManager`` and the active-``WorkerProcess`` count from the +database. + +All instrumentation no-ops when ``app.execution_timer_factory.galaxy_statsd_client`` +is ``None`` — i.e. statsd isn't configured. 
+""" + +import datetime +import logging +from collections import defaultdict +from typing import TYPE_CHECKING + +from lagom.exceptions import UnresolvableType +from sqlalchemy import ( + func, + select, +) + +from galaxy.managers.sse import SSEConnectionManager +from galaxy.model import WorkerProcess +from galaxy.model.orm.now import now +from galaxy.queues import ( + all_control_queues_for_declare, + DEFAULT_ACTIVE_PROCESS_WINDOW_SECONDS, +) + +if TYPE_CHECKING: + from galaxy.structured_app import StructuredApp + from galaxy.web.statsd_client import VanillaGalaxyStatsdClient + +log = logging.getLogger(__name__) + + +def emit_sse_connection_gauges( + statsd_client: "VanillaGalaxyStatsdClient", + sse_manager: SSEConnectionManager, +) -> None: + """Emit ``galaxy.sse.connections.active`` gauges by connection kind.""" + statsd_client.timing( + "galaxy.sse.connections.active", + sse_manager.total_broadcast_connections, + tags={"kind": "broadcast"}, + ) + statsd_client.timing( + "galaxy.sse.connections.active", + sse_manager.total_per_user_connections, + tags={"kind": "per_user"}, + ) + + +def emit_control_queue_depth( + statsd_client: "VanillaGalaxyStatsdClient", + app: "StructuredApp", +) -> None: + """Emit ``galaxy.control_queue.depth`` per active webapp/handler queue. + + A per-queue passive declare can fail on transports that don't implement it + (e.g. the sqlalchemy kombu transport) or for queues that don't yet exist on + the broker. Those are expected and quiet — logged at DEBUG, no metric, move + on. Errors at the broker-connection layer propagate up so the caller can + surface them. 
+ """ + connection = app.amqp_internal_connection_obj + if connection is None: + return + queues = all_control_queues_for_declare(app.application_stack) + if not queues: + return + with connection.clone() as conn: + channel = conn.channel() + try: + for queue in queues: + try: + declared = queue.bind(channel).queue_declare(passive=True) + except Exception: + log.debug( + "queue_metrics: passive declare failed for %s", + queue.name, + exc_info=True, + ) + continue + statsd_client.timing( + "galaxy.control_queue.depth", + declared.message_count, + tags={"queue_name": queue.name}, + ) + finally: + channel.close() + + +def emit_worker_process_gauge( + statsd_client: "VanillaGalaxyStatsdClient", + app: "StructuredApp", +) -> None: + """Emit ``galaxy.worker_process.active`` gauge grouped by ``app_type``.""" + cutoff = now() - datetime.timedelta(seconds=DEFAULT_ACTIVE_PROCESS_WINDOW_SECONDS) + stmt = ( + select(WorkerProcess.app_type, func.count(WorkerProcess.id)) + .where(WorkerProcess.update_time > cutoff) + .group_by(WorkerProcess.app_type) + ) + counts: dict[str, int] = defaultdict(int) + with app.model.new_session() as session: + for app_type, count in session.execute(stmt): + counts[app_type or "unknown"] = int(count) + for app_type, count in counts.items(): + statsd_client.timing( + "galaxy.worker_process.active", + count, + tags={"app_type": app_type}, + ) + + +def _run(name: str, statsd_client: "VanillaGalaxyStatsdClient", fn) -> None: + """Run a sub-emitter, isolating its failures. + + A broken sub-emitter logs once at WARNING and increments + ``galaxy.queue_metrics.error`` (tagged by emitter name) so the failure is + observable in metrics without the Celery-beat wrapper logging on every + tick. The other sub-emitters continue to run on this tick. 
+ """ + try: + fn() + except Exception: + log.warning("queue_metrics: %s emitter failed", name, exc_info=True) + statsd_client.incr("galaxy.queue_metrics.error", tags={"emitter": name}) + + +def emit_queue_metrics(app: "StructuredApp") -> None: + """Periodic entry-point — no-ops when statsd isn't configured.""" + statsd_client = app.execution_timer_factory.galaxy_statsd_client + if statsd_client is None: + return + try: + sse_manager = app[SSEConnectionManager] + except UnresolvableType: + sse_manager = None + if sse_manager is not None: + _run("sse_connections", statsd_client, lambda: emit_sse_connection_gauges(statsd_client, sse_manager)) + _run("control_queue_depth", statsd_client, lambda: emit_control_queue_depth(statsd_client, app)) + _run("worker_process", statsd_client, lambda: emit_worker_process_gauge(statsd_client, app)) diff --git a/lib/galaxy/webapps/galaxy/services/base.py b/lib/galaxy/webapps/galaxy/services/base.py index d485d277d5ba..e86924966186 100644 --- a/lib/galaxy/webapps/galaxy/services/base.py +++ b/lib/galaxy/webapps/galaxy/services/base.py @@ -7,7 +7,6 @@ Optional, ) -from galaxy.celery.helpers import async_task_summary as async_task_summary # re-export for existing callers from galaxy.exceptions import ( AuthenticationRequired, ConfigDoesNotAllowException, diff --git a/lib/galaxy/webapps/galaxy/services/datasets.py b/lib/galaxy/webapps/galaxy/services/datasets.py index aae67a7ec74b..d3f7f8ee8b3a 100644 --- a/lib/galaxy/webapps/galaxy/services/datasets.py +++ b/lib/galaxy/webapps/galaxy/services/datasets.py @@ -24,6 +24,7 @@ util, web, ) +from galaxy.celery.helpers import async_task_summary from galaxy.celery.tasks import compute_dataset_hash from galaxy.datatypes.binary import Binary from galaxy.datatypes.dataproviders.exceptions import NoProviderAvailable @@ -88,10 +89,7 @@ ) from galaxy.visualization.data_providers.registry import DataProviderRegistry from galaxy.webapps.base.controller import UsesVisualizationMixin -from 
galaxy.webapps.galaxy.services.base import ( - async_task_summary, - ServiceBase, -) +from galaxy.webapps.galaxy.services.base import ServiceBase log = logging.getLogger(__name__) diff --git a/lib/galaxy/webapps/galaxy/services/histories.py b/lib/galaxy/webapps/galaxy/services/histories.py index 25e8982da5df..097fbc1c96f4 100644 --- a/lib/galaxy/webapps/galaxy/services/histories.py +++ b/lib/galaxy/webapps/galaxy/services/histories.py @@ -24,6 +24,7 @@ exceptions as glx_exceptions, model, ) +from galaxy.celery.helpers import async_task_summary from galaxy.celery.tasks import ( import_model_store, prepare_history_download, @@ -87,7 +88,6 @@ from galaxy.short_term_storage import ShortTermStorageAllocator from galaxy.util import restore_text from galaxy.webapps.galaxy.services.base import ( - async_task_summary, ConsumesModelStores, model_store_storage_target, ServesExportStores, diff --git a/lib/galaxy/webapps/galaxy/services/history_contents.py b/lib/galaxy/webapps/galaxy/services/history_contents.py index 581d1dc772e7..53926007bd31 100644 --- a/lib/galaxy/webapps/galaxy/services/history_contents.py +++ b/lib/galaxy/webapps/galaxy/services/history_contents.py @@ -21,6 +21,7 @@ ) from galaxy import exceptions +from galaxy.celery.helpers import async_task_summary from galaxy.celery.tasks import ( change_datatype, materialize as materialize_task, @@ -118,7 +119,6 @@ from galaxy.short_term_storage import ShortTermStorageAllocator from galaxy.util.zipstream import ZipstreamWrapper from galaxy.webapps.galaxy.services.base import ( - async_task_summary, ConsumesModelStores, ensure_celery_tasks_enabled, model_store_storage_target, diff --git a/lib/galaxy/webapps/galaxy/services/invocations.py b/lib/galaxy/webapps/galaxy/services/invocations.py index 29654d885a41..dc5cc27a5770 100644 --- a/lib/galaxy/webapps/galaxy/services/invocations.py +++ b/lib/galaxy/webapps/galaxy/services/invocations.py @@ -6,6 +6,7 @@ from pydantic import Field +from galaxy.celery.helpers import 
async_task_summary from galaxy.celery.tasks import ( prepare_invocation_download, write_invocation_to, @@ -61,7 +62,6 @@ from galaxy.security.idencoding import IdEncodingHelper from galaxy.short_term_storage import ShortTermStorageAllocator from galaxy.webapps.galaxy.services.base import ( - async_task_summary, ConsumesModelStores, ensure_celery_tasks_enabled, model_store_storage_target, diff --git a/lib/galaxy/webapps/galaxy/services/jobs.py b/lib/galaxy/webapps/galaxy/services/jobs.py index fac6ea9ecf37..daadbe37b13a 100644 --- a/lib/galaxy/webapps/galaxy/services/jobs.py +++ b/lib/galaxy/webapps/galaxy/services/jobs.py @@ -16,6 +16,7 @@ exceptions, model, ) +from galaxy.celery.helpers import async_task_summary from galaxy.celery.tasks import queue_jobs from galaxy.managers import hdas from galaxy.managers.base import security_check @@ -60,7 +61,6 @@ ToolParameterBundleModel, ) from galaxy.webapps.galaxy.services.base import ( - async_task_summary, ServiceBase, ) from .tools import validate_tool_for_running diff --git a/lib/galaxy/webapps/galaxy/services/pages.py b/lib/galaxy/webapps/galaxy/services/pages.py index d77560efd272..e95e6dd959de 100644 --- a/lib/galaxy/webapps/galaxy/services/pages.py +++ b/lib/galaxy/webapps/galaxy/services/pages.py @@ -4,6 +4,7 @@ ) from galaxy import exceptions +from galaxy.celery.helpers import async_task_summary from galaxy.celery.tasks import prepare_pdf_download from galaxy.managers import base from galaxy.managers.markdown_util import ( @@ -34,7 +35,6 @@ from galaxy.short_term_storage import ShortTermStorageAllocator from galaxy.webapps.galaxy.api.common import PageIdPathParam from galaxy.webapps.galaxy.services.base import ( - async_task_summary, ensure_celery_tasks_enabled, ServiceBase, ) diff --git a/lib/galaxy/webapps/galaxy/services/users.py b/lib/galaxy/webapps/galaxy/services/users.py index 39aa35c03a45..dd8fa7650c12 100644 --- a/lib/galaxy/webapps/galaxy/services/users.py +++ 
b/lib/galaxy/webapps/galaxy/services/users.py @@ -9,6 +9,7 @@ exceptions as glx_exceptions, util, ) +from galaxy.celery.helpers import async_task_summary from galaxy.managers import api_keys from galaxy.managers.context import ( ProvidesHistoryContext, @@ -34,10 +35,7 @@ UserModel, ) from galaxy.security.idencoding import IdEncodingHelper -from galaxy.webapps.galaxy.services.base import ( - async_task_summary, - ServiceBase, -) +from galaxy.webapps.galaxy.services.base import ServiceBase from galaxy.webapps.galaxy.services.roles import role_to_model if TYPE_CHECKING: diff --git a/test/integration/test_entry_point_sse.py b/test/integration/test_entry_point_sse.py new file mode 100644 index 000000000000..ead6e24b10ec --- /dev/null +++ b/test/integration/test_entry_point_sse.py @@ -0,0 +1,158 @@ +"""Integration tests for SSE-based interactive-tool entry-point update notifications. + +Mirrors ``test_history_sse.py``. Rather than spin up a real containerized +interactive tool (which would require docker), these tests exercise the +dispatch path by creating a ``Job`` and ``InteractiveToolEntryPoint`` rows +directly via the live app's SQLAlchemy session and invoking +``InteractiveToolManager.configure_entry_points`` with a stub ``ports_dict``. +This is exactly the moment the event fires in production — the job runner's +port-routing hook is the only upstream caller, and the SSE dispatch happens +after the DB commit regardless of how the ports were obtained. 
+""" + +import time +from urllib.parse import urljoin +from uuid import uuid4 + +from galaxy.model import ( + InteractiveToolEntryPoint, + Job, +) +from galaxy_test.base.populators import DatasetPopulator +from galaxy_test.base.sse import SSELineListener +from galaxy_test.driver.integration_util import IntegrationTestCase + + +def _make_ports_dict(tool_port: int) -> dict: + """Stub the runner's port-routing payload — one tool_port, fake host/proto.""" + return { + str(tool_port): { + "host": "host.invalid", + "port": 12345, + "protocol": "http", + } + } + + +class TestEntryPointSSEIntegration(IntegrationTestCase): + dataset_populator: DatasetPopulator + framework_tool_and_types = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + super().handle_galaxy_config_kwds(config) + config["enable_celery_tasks"] = False + + def setUp(self): + super().setUp() + self.dataset_populator = DatasetPopulator(self.galaxy_interactor) + + def _events_stream_url(self) -> str: + return urljoin(self.url, "api/events/stream") + + def _user_id_for_api_key(self, api_key: str) -> int: + """Return the integer ``User.id`` for the user owning ``api_key``.""" + # The ``/api/users/current`` endpoint returns the encoded id; decode + # via the app's security helper so we get the raw int the job row + # needs. + import requests + + response = requests.get(urljoin(self.url, "api/users/current"), params={"key": api_key}) + response.raise_for_status() + encoded_id = response.json()["id"] + return self._app.security.decode_id(encoded_id) + + def _create_it_job_with_entry_point(self, user_id: int, tool_port: int = 8888) -> tuple[int, int]: + """Create a minimal Job + unconfigured InteractiveToolEntryPoint row pair. + + Returns ``(job_id, entry_point_id)``. The session used is the live + app's; the rows are real and survive the call. 
+ """ + sa_session = self._app.model.context + job = Job() + job.user_id = user_id + job.tool_id = "interactivetool_simple" + job.state = Job.states.RUNNING + sa_session.add(job) + sa_session.flush() + ep = InteractiveToolEntryPoint( + job=job, + tool_port=tool_port, + entry_url="/", + name="test entry point", + label="test", + requires_domain=True, + requires_path_in_url=False, + requires_path_in_header_named=None, + ) + sa_session.add(ep) + sa_session.commit() + return job.id, ep.id + + def test_entry_point_update_event_fires_on_configure(self): + """configure_entry_points should fire an ``entry_point_update`` wake-up event.""" + api_key = self.galaxy_interactor.api_key + assert api_key is not None + user_id = self._user_id_for_api_key(api_key) + job_id, _ = self._create_it_job_with_entry_point(user_id) + + listener = SSELineListener(self._events_stream_url(), api_key) + listener.start() + try: + sa_session = self._app.model.context + job = sa_session.get(Job, job_id) + assert job is not None + self._app.interactivetool_manager.configure_entry_points(job, _make_ports_dict(8888)) + + entry_point_events = listener.wait_for_event("entry_point_update") + finally: + listener.stop() + + # The event carries no payload: the client refetches ``/api/entry_points`` + # (the canonical source) on receipt, so the event just needs to arrive. + assert len(entry_point_events) >= 1, f"Expected entry_point_update wake-up, got: {entry_point_events}" + assert entry_point_events[0]["event"] == "entry_point_update" + + def test_entry_point_update_is_scoped_to_owning_user(self): + """User A must not see entry_point_update events for user B's jobs. + + The event has no payload to cross-check with, so we assert on event + count: user A's stream should receive exactly one event for its own + ``configure_entry_points`` call and none for user B's. 
+ """ + user_b = self._setup_user(f"{uuid4()}@galaxy.test") + _, user_b_api_key = self._setup_user_get_key(user_b["email"]) + user_b_id = self._user_id_for_api_key(user_b_api_key) + + user_a_api_key = self.galaxy_interactor.api_key + assert user_a_api_key is not None + user_a_id = self._user_id_for_api_key(user_a_api_key) + + job_a_id, _ = self._create_it_job_with_entry_point(user_a_id, tool_port=7001) + job_b_id, _ = self._create_it_job_with_entry_point(user_b_id, tool_port=7002) + + listener = SSELineListener(self._events_stream_url(), user_a_api_key) + listener.start() + try: + sa_session = self._app.model.context + job_b = sa_session.get(Job, job_b_id) + assert job_b is not None + # User B's job — user A must NOT see this. + self._app.interactivetool_manager.configure_entry_points(job_b, _make_ports_dict(7002)) + # Give the broker a moment so a leaked event (if any) would arrive + # before we fire user A's event — the assertion would then catch + # more than one event on user A's stream. + time.sleep(0.5) + + job_a = sa_session.get(Job, job_a_id) + assert job_a is not None + # User A's own job — this is what A's stream must observe. + self._app.interactivetool_manager.configure_entry_points(job_a, _make_ports_dict(7001)) + + entry_point_events = listener.wait_for_event("entry_point_update") + finally: + listener.stop() + + assert ( + len(entry_point_events) == 1 + ), f"User A expected exactly one entry_point_update (own job); saw {len(entry_point_events)}: {entry_point_events}" diff --git a/test/integration_selenium/test_entry_point_sse.py b/test/integration_selenium/test_entry_point_sse.py new file mode 100644 index 000000000000..e661cb0a79fb --- /dev/null +++ b/test/integration_selenium/test_entry_point_sse.py @@ -0,0 +1,107 @@ +"""Playwright E2E test for the interactive-tool entry-point SSE pipeline. 
+ +Verifies that when an interactive-tool entry point transitions to ``configured`` +server-side (the runner's port-routing hook calls ``configure_entry_points``), +a logged-in user's browser receives the ``entry_point_update`` SSE event and +the entry-point store refetches, without the 10 s polling interval. + +This test stubs the server-side runner callback: it creates the Job and entry +point directly and calls ``InteractiveToolManager.configure_entry_points`` on +the live app. That invocation is the exact dispatch site in production. +""" + +from uuid import uuid4 + +from galaxy.util.wait import wait_on +from galaxy_test.selenium.framework import ( + managed_history, + selenium_test, +) +from .framework import SeleniumIntegrationTestCase + +SSE_CONNECT_TIMEOUT_SECONDS = 15 +SSE_EVENT_TIMEOUT_SECONDS = 15 + + +class TestEntryPointSSESeleniumIntegration(SeleniumIntegrationTestCase): + ensure_registered = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + super().handle_galaxy_config_kwds(config) + config["enable_celery_tasks"] = False + + def _wait_for_sse_connected(self) -> None: + """Block until the frontend confirms the SSE pipeline is live.""" + wait_on( + lambda: True if self.driver.execute_script("return window.__galaxy_sse_connected === true") else None, + "window.__galaxy_sse_connected === true", + timeout=SSE_CONNECT_TIMEOUT_SECONDS, + ) + + def _last_sse_event_ts(self) -> int: + return self.driver.execute_script("return window.__galaxy_sse_last_event_ts || 0") or 0 + + def _wait_for_sse_event_after(self, baseline_ts: int) -> None: + wait_on( + lambda: True if self._last_sse_event_ts() > baseline_ts else None, + "window.__galaxy_sse_last_event_ts advanced past baseline", + timeout=SSE_EVENT_TIMEOUT_SECONDS, + ) + + def _create_it_job_with_entry_point(self, tool_port: int = 8888) -> tuple[int, int]: + from galaxy.model import ( + InteractiveToolEntryPoint, + Job, + ) + + user_info = self._get("users/current").json() + user_id = 
self._app.security.decode_id(user_info["id"]) + sa_session = self._app.model.context + job = Job() + job.user_id = user_id + job.tool_id = "interactivetool_simple" + job.state = Job.states.RUNNING + sa_session.add(job) + sa_session.flush() + ep = InteractiveToolEntryPoint( + job=job, + tool_port=tool_port, + entry_url="/", + name=f"selenium entry {uuid4()}", + label="selenium", + requires_domain=True, + requires_path_in_url=False, + requires_path_in_header_named=None, + ) + sa_session.add(ep) + sa_session.commit() + return job.id, ep.id + + @selenium_test + @managed_history + def test_entry_point_update_pushed_via_sse(self): + """configure_entry_points should trigger an SSE push the client observes.""" + # Navigate home so the entry-point store is mounted and subscribed. + self.home() + self._wait_for_sse_connected() + baseline_ts = self._last_sse_event_ts() + self.screenshot("entry_point_sse_before") + + job_id, _ep_id = self._create_it_job_with_entry_point(tool_port=8888) + + # Stub the runner hook: call configure_entry_points on the live app. + from galaxy.model import Job + + sa_session = self._app.model.context + job = sa_session.get(Job, job_id) + assert job is not None + self._app.interactivetool_manager.configure_entry_points( + job, + {"8888": {"host": "host.invalid", "port": 12345, "protocol": "http"}}, + ) + + # Prove the update arrived via SSE (not polling): the composable's + # event-timestamp hook only advances when useSSE's listener fires. 
+ self._wait_for_sse_event_after(baseline_ts) + self.screenshot("entry_point_sse_after") diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index 715898bbd2eb..cae3b6e7b6ba 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -129,7 +129,5 @@ def test_notification_bell_updates_via_sse(self): self._wait_for_sse_event_after(baseline_ts) # The indicator dot should appear on the bell (within the #activity-notifications element) - self.wait_for_selector_visible( - "#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000 - ) + self.wait_for_selector_visible("#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) self.screenshot("notification_bell_indicator") diff --git a/test/unit/app/managers/test_queue_metrics.py b/test/unit/app/managers/test_queue_metrics.py new file mode 100644 index 000000000000..fddaa576924d --- /dev/null +++ b/test/unit/app/managers/test_queue_metrics.py @@ -0,0 +1,243 @@ +"""Unit tests for :mod:`galaxy.webapps.galaxy.metrics.queue_metrics`. + +The DI container (``app``), the ``SSEConnectionManager``, and the kombu +connection are small hand-built fakes — no broker or database is required. +Assertions are on the recorded state of the statsd client (counters, timings) +rather than on mock call-lists so a regression that stops emitting a gauge +fails the test for the right reason. + +The failure-isolation test drives real sub-emitters into their error paths by +handing them genuinely broken collaborators (a connection whose ``clone()`` +raises, a model whose ``new_session()`` raises). That way the test exercises +the real ``_run`` wrapper rather than asserting a monkey-patched side_effect. 
+""" + +from dataclasses import ( + dataclass, + field, +) +from types import SimpleNamespace +from typing import ( + cast, + Optional, +) +from unittest.mock import MagicMock + +import pytest + +from galaxy.managers.sse import SSEConnectionManager +from galaxy.structured_app import StructuredApp +from galaxy.web.statsd_client import VanillaGalaxyStatsdClient +from galaxy.webapps.galaxy.metrics import queue_metrics + +# --------------------------------------------------------------------------- +# Fakes +# --------------------------------------------------------------------------- + + +@dataclass +class FakeStatsdClient: + """In-memory statsd recorder — see docstring in ``test_sse_dispatch.py``.""" + + counters: dict[tuple[str, tuple[tuple[str, str], ...]], int] = field(default_factory=dict) + timings: list[tuple[str, float, tuple[tuple[str, str], ...]]] = field(default_factory=list) + + def incr(self, metric: str, tags: Optional[dict[str, str]] = None) -> None: + key = (metric, tuple(sorted((tags or {}).items()))) + self.counters[key] = self.counters.get(key, 0) + 1 + + def timing(self, metric: str, value: float, tags: Optional[dict[str, str]] = None) -> None: + self.timings.append((metric, value, tuple(sorted((tags or {}).items())))) + + def counter(self, metric: str, tags: Optional[dict[str, str]] = None) -> int: + return self.counters.get((metric, tuple(sorted((tags or {}).items()))), 0) + + def timings_for(self, metric: str) -> list[tuple[float, dict[str, str]]]: + return [(v, dict(t)) for m, v, t in self.timings if m == metric] + + +class _ContainerApp: + """Tiny stand-in for ``StructuredApp`` + the Lagom container. + + Supports ``app[ClassName]`` lookup for ``SSEConnectionManager`` and + arbitrary attribute access. 
+ """ + + def __init__(self, **attrs): + self._container: dict[type, object] = {} + for k, v in attrs.items(): + setattr(self, k, v) + + def register(self, cls, instance): + self._container[cls] = instance + + def __getitem__(self, cls): + return self._container[cls] + + +def _fake_sse_manager(broadcast: int = 3, per_user: int = 5): + m = MagicMock(spec=SSEConnectionManager) + m.total_broadcast_connections = broadcast + m.total_per_user_connections = per_user + return m + + +def _make_fake_queue(name: str, count: int): + """Fake kombu Queue exposing ``.bind(channel).queue_declare(passive=True).message_count``.""" + declared = SimpleNamespace(message_count=count) + bound = SimpleNamespace(queue_declare=lambda passive: declared) + return SimpleNamespace(name=name, bind=lambda channel: bound) + + +def _make_fake_connection(channel=None): + """Fake kombu Connection whose ``.clone()`` is usable as a context manager.""" + channel = channel or MagicMock() + conn_cm = MagicMock() + conn_cm.channel.return_value = channel + cm = MagicMock() + cm.__enter__ = lambda self: conn_cm + cm.__exit__ = lambda self, *a: False + connection = MagicMock() + connection.clone.return_value = cm + return connection + + +# --------------------------------------------------------------------------- +# Sub-emitter tests — assert on recorded state +# --------------------------------------------------------------------------- + + +def test_emit_sse_connection_gauges_emits_both_kinds(): + statsd = FakeStatsdClient() + queue_metrics.emit_sse_connection_gauges( + cast(VanillaGalaxyStatsdClient, statsd), _fake_sse_manager(broadcast=4, per_user=7) + ) + + assert statsd.timings_for("galaxy.sse.connections.active") == [ + (4, {"kind": "broadcast"}), + (7, {"kind": "per_user"}), + ] + + +def test_emit_control_queue_depth_emits_per_queue(monkeypatch): + statsd = FakeStatsdClient() + fake_queues = [ + _make_fake_queue("control.main@h", 3), + _make_fake_queue("control.main.1@h", 0), + ] + 
monkeypatch.setattr( + queue_metrics, + "all_control_queues_for_declare", + lambda application_stack: fake_queues, + ) + + app = _ContainerApp( + amqp_internal_connection_obj=_make_fake_connection(), + application_stack=MagicMock(), + ) + queue_metrics.emit_control_queue_depth(cast(VanillaGalaxyStatsdClient, statsd), cast(StructuredApp, app)) + + assert statsd.timings_for("galaxy.control_queue.depth") == [ + (3, {"queue_name": "control.main@h"}), + (0, {"queue_name": "control.main.1@h"}), + ] + + +def test_emit_control_queue_depth_skips_failed_passive_declare(monkeypatch): + """One bad queue → we skip it and keep going for the rest.""" + statsd = FakeStatsdClient() + + def bad_declare(passive): + raise RuntimeError("queue does not exist yet") + + good_queue = _make_fake_queue("control.good@h", 9) + bad_queue = SimpleNamespace( + name="control.bad@h", + bind=lambda channel: SimpleNamespace(queue_declare=bad_declare), + ) + monkeypatch.setattr(queue_metrics, "all_control_queues_for_declare", lambda stack: [good_queue, bad_queue]) + + app = _ContainerApp( + amqp_internal_connection_obj=_make_fake_connection(), + application_stack=MagicMock(), + ) + queue_metrics.emit_control_queue_depth(cast(VanillaGalaxyStatsdClient, statsd), cast(StructuredApp, app)) + + assert statsd.timings_for("galaxy.control_queue.depth") == [ + (9, {"queue_name": "control.good@h"}), + ] + + +def test_emit_control_queue_depth_no_broker_connection_is_noop(): + statsd = FakeStatsdClient() + app = _ContainerApp(amqp_internal_connection_obj=None, application_stack=MagicMock()) + queue_metrics.emit_control_queue_depth(cast(VanillaGalaxyStatsdClient, statsd), cast(StructuredApp, app)) + assert statsd.timings == [] + + +# --------------------------------------------------------------------------- +# emit_queue_metrics — aggregate entry-point +# --------------------------------------------------------------------------- + + +def test_emit_queue_metrics_is_silent_when_statsd_is_none(): + """No statsd 
client → every sub-call is skipped, no DB or broker access.""" + app = _ContainerApp( + execution_timer_factory=SimpleNamespace(galaxy_statsd_client=None), + ) + # Would raise AttributeError if the short-circuit didn't fire before any + # real collaborator was touched. + queue_metrics.emit_queue_metrics(cast(StructuredApp, app)) + + +def test_emit_queue_metrics_isolates_real_subemitter_failures(monkeypatch): + """When real sub-emitters raise, ``_run`` contains the failure and logs an error counter. + + We drive the failures through the actual sub-emitter bodies — not monkey- + patched side_effects — by handing in a connection whose ``.clone()`` raises + (for ``emit_control_queue_depth``) and a model whose ``.new_session()`` + raises (for ``emit_worker_process_gauge``). The SSE gauge is given healthy + collaborators and should still land. + """ + statsd = FakeStatsdClient() + sse_manager = _fake_sse_manager(broadcast=2, per_user=1) + + broken_connection = MagicMock() + broken_connection.clone.side_effect = RuntimeError("broker is gone") + + broken_model = MagicMock() + broken_model.new_session.side_effect = RuntimeError("db is gone") + + # Ensure the broker path reaches .clone() rather than short-circuiting on + # an empty queue list. + monkeypatch.setattr( + queue_metrics, + "all_control_queues_for_declare", + lambda stack: [_make_fake_queue("control.main@h", 0)], + ) + + app = _ContainerApp( + execution_timer_factory=SimpleNamespace(galaxy_statsd_client=statsd), + amqp_internal_connection_obj=broken_connection, + application_stack=MagicMock(), + model=broken_model, + ) + app.register(SSEConnectionManager, sse_manager) + + # Must not raise — the SSE gauge still lands. + queue_metrics.emit_queue_metrics(cast(StructuredApp, app)) + + # SSE gauge landed despite the other two failing. 
+ sse_timings = statsd.timings_for("galaxy.sse.connections.active") + assert (2, {"kind": "broadcast"}) in sse_timings + assert (1, {"kind": "per_user"}) in sse_timings + + # Each failing sub-emitter bumped its error counter tagged by name. + assert statsd.counter("galaxy.queue_metrics.error", {"emitter": "control_queue_depth"}) == 1 + assert statsd.counter("galaxy.queue_metrics.error", {"emitter": "worker_process"}) == 1 + # The healthy SSE sub-emitter did not. + assert statsd.counter("galaxy.queue_metrics.error", {"emitter": "sse_connections"}) == 0 + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/test/unit/app/managers/test_sse_dispatch.py b/test/unit/app/managers/test_sse_dispatch.py new file mode 100644 index 000000000000..08da83632ed8 --- /dev/null +++ b/test/unit/app/managers/test_sse_dispatch.py @@ -0,0 +1,226 @@ +"""Unit tests for :class:`galaxy.managers.sse_dispatch.SSEEventDispatcher` observability. + +Focus is on the statsd instrumentation contract AND the effect of dispatch: +counters/timers fire on the happy path and the ``_queue_worker is None`` +early-return, payloads reach the broker with the expected task+kwargs, and the +dispatcher is a silent no-op when ``statsd_client`` is ``None``. + +These tests use lightweight fakes (``FakeStatsdClient``, ``FakeControlTask``) +that record state we can assert against — rather than ``MagicMock`` call-lists — +so a regression that silently drops dispatch or stops recording metrics fails +the test for the right reason. +""" + +from dataclasses import ( + dataclass, + field, +) +from typing import ( + Any, + Optional, +) +from unittest.mock import MagicMock + +import pytest + +from galaxy.managers.sse_dispatch import SSEEventDispatcher + + +@dataclass +class FakeStatsdClient: + """In-memory stand-in for ``VanillaGalaxyStatsdClient``. 
+ + Records ``incr`` and ``timing`` calls as plain data so tests assert on + observable state (counter totals, recorded timings) instead of mock + call-lists. + """ + + counters: dict[tuple[str, tuple[tuple[str, str], ...]], int] = field(default_factory=dict) + timings: list[tuple[str, float, tuple[tuple[str, str], ...]]] = field(default_factory=list) + + def incr(self, metric: str, tags: Optional[dict[str, str]] = None) -> None: + key = (metric, tuple(sorted((tags or {}).items()))) + self.counters[key] = self.counters.get(key, 0) + 1 + + def timing(self, metric: str, value: float, tags: Optional[dict[str, str]] = None) -> None: + self.timings.append((metric, value, tuple(sorted((tags or {}).items())))) + + def counter(self, metric: str, tags: Optional[dict[str, str]] = None) -> int: + return self.counters.get((metric, tuple(sorted((tags or {}).items()))), 0) + + +@dataclass +class RecordedTask: + payload: dict[str, Any] + routing_key: str + expiration: Optional[int] + declare_queues: Any + + +class FakeControlTask: + """Stand-in for ``ControlTask`` that records dispatches instead of touching AMQP.""" + + instances: list["FakeControlTask"] = [] + + def __init__(self, queue_worker) -> None: + self.queue_worker = queue_worker + self.sent: list[RecordedTask] = [] + FakeControlTask.instances.append(self) + + def send_task( + self, + payload: dict[str, Any], + routing_key: str, + expiration: Optional[int] = None, + declare_queues: Any = None, + **_: Any, + ) -> None: + self.sent.append( + RecordedTask( + payload=payload, + routing_key=routing_key, + expiration=expiration, + declare_queues=declare_queues, + ) + ) + + +class BoomControlTask: + """``ControlTask`` fake whose ``send_task`` always raises — exercises the finally block.""" + + def __init__(self, queue_worker) -> None: + self.queue_worker = queue_worker + + def send_task(self, **kwargs) -> None: + raise RuntimeError("broker down") + + +@pytest.fixture +def application_stack(): + return 
MagicMock(name="ApplicationStack") + + +@pytest.fixture +def queue_worker(): + return MagicMock(name="GalaxyQueueWorker") + + +@pytest.fixture +def statsd() -> FakeStatsdClient: + return FakeStatsdClient() + + +@pytest.fixture(autouse=True) +def _reset_fake_control_task_instances(): + FakeControlTask.instances.clear() + yield + FakeControlTask.instances.clear() + + +@pytest.fixture +def fake_control_task(monkeypatch) -> type[FakeControlTask]: + monkeypatch.setattr("galaxy.managers.sse_dispatch.ControlTask", FakeControlTask) + monkeypatch.setattr( + "galaxy.managers.sse_dispatch.all_control_queues_for_declare", + lambda *args, **kwargs: [], + ) + return FakeControlTask + + +def test_dispatcher_no_op_when_queue_worker_is_none_and_no_statsd(application_stack): + """No statsd client set and no queue_worker → silent no-op, no AttributeError.""" + dispatcher = SSEEventDispatcher( + queue_worker=None, + application_stack=application_stack, + statsd_client=None, + ) + # Must not raise, must not attempt to declare queues. + dispatcher.notify_users([1, 2], "hello") + dispatcher.notify_broadcast("hi") + dispatcher.history_update({"1": [42]}) + + +def test_dispatcher_records_skipped_counter_when_queue_worker_is_none(application_stack, statsd): + """Two dispatches with no queue_worker → two skipped_no_qw increments, no timings.""" + dispatcher = SSEEventDispatcher( + queue_worker=None, + application_stack=application_stack, + statsd_client=statsd, + ) + dispatcher.notify_users([1], "hello") + dispatcher.notify_broadcast("world") + + assert statsd.counter("galaxy.sse.dispatch.skipped_no_qw") == 2 + # No latency timing — we never got as far as the broker call. 
+ assert statsd.timings == [] + + +def test_dispatcher_enqueues_payload_and_records_metrics_on_send( + application_stack, queue_worker, statsd, fake_control_task +): + """Happy path: payload reaches the broker AND counter+timer are recorded.""" + dispatcher = SSEEventDispatcher( + queue_worker=queue_worker, + application_stack=application_stack, + statsd_client=statsd, + ) + dispatcher.notify_users([1, 2], "hello") + + # Exactly one ControlTask constructed, one send_task recorded with the right + # payload — asserts the dispatch *effect*, not just that a mock was called. + assert len(fake_control_task.instances) == 1 + sent = fake_control_task.instances[0].sent + assert len(sent) == 1 + assert sent[0].payload["task"] == "notify_users" + assert sent[0].payload["kwargs"]["user_ids"] == [1, 2] + assert sent[0].payload["kwargs"]["payload"] == "hello" + assert "event_id" in sent[0].payload["kwargs"] + assert sent[0].routing_key == "control.*" + assert sent[0].expiration == 10 + + # Counter + timer both recorded with matching task tag. 
+ assert statsd.counter("galaxy.sse.dispatch.count", {"task": "notify_users"}) == 1 + assert len(statsd.timings) == 1 + metric, _value, tags = statsd.timings[0] + assert metric == "galaxy.sse.dispatch.latency_ms" + assert dict(tags) == {"task": "notify_users"} + + +def test_dispatcher_timer_still_fires_on_send_exception(monkeypatch, application_stack, queue_worker, statsd): + """Timer lives in ``finally`` — broker errors don't mask the latency metric.""" + monkeypatch.setattr( + "galaxy.managers.sse_dispatch.all_control_queues_for_declare", + lambda *args, **kwargs: [], + ) + monkeypatch.setattr("galaxy.managers.sse_dispatch.ControlTask", BoomControlTask) + + dispatcher = SSEEventDispatcher( + queue_worker=queue_worker, + application_stack=application_stack, + statsd_client=statsd, + ) + with pytest.raises(RuntimeError): + dispatcher.history_update({"7": [1]}) + + assert statsd.counter("galaxy.sse.dispatch.count", {"task": "history_update"}) == 1 + assert len(statsd.timings) == 1 + metric, _value, tags = statsd.timings[0] + assert metric == "galaxy.sse.dispatch.latency_ms" + assert dict(tags) == {"task": "history_update"} + + +def test_dispatcher_no_statsd_means_no_instrumentation(application_stack, queue_worker, fake_control_task): + """When ``statsd_client`` is ``None`` instrumentation is bypassed entirely. + + The dispatch still happens — we assert via the ControlTask fake — but there + is nothing to observe on the (absent) statsd side. 
+ """ + dispatcher = SSEEventDispatcher( + queue_worker=queue_worker, + application_stack=application_stack, + statsd_client=None, + ) + dispatcher.notify_broadcast("hi") + assert len(fake_control_task.instances) == 1 + assert len(fake_control_task.instances[0].sent) == 1 + assert fake_control_task.instances[0].sent[0].payload["task"] == "notify_broadcast" diff --git a/test/unit/app/managers/test_sse_dispatch_cache.py b/test/unit/app/managers/test_sse_dispatch_cache.py new file mode 100644 index 000000000000..e7d1a7e9ad42 --- /dev/null +++ b/test/unit/app/managers/test_sse_dispatch_cache.py @@ -0,0 +1,140 @@ +"""Tests for the TTL cache on ``SSEEventDispatcher._get_declare_queues``. + +The cache exists because ``_send`` is on a hot path (1000+ events/s at target +load) and without it each dispatch fires a ``WorkerProcess`` DB query. The +underlying data only changes on a 60 s heartbeat cadence, so a 30 s TTL is safe. +""" + +from concurrent.futures import ( + as_completed, + ThreadPoolExecutor, +) +from typing import Any +from unittest.mock import MagicMock + +import pytest + +from galaxy.managers import sse_dispatch +from galaxy.managers.sse_dispatch import SSEEventDispatcher + + +@pytest.fixture +def fake_declare(monkeypatch): + """Replace ``all_control_queues_for_declare`` with a call-counting fake. + + Returns a non-empty list by default so the cache stores a value — empty + results are intentionally not cached (see ``test_empty_result_not_cached``). + """ + calls: dict[str, Any] = {"count": 0, "returns": [MagicMock(name="queue")]} + + def _fake(application_stack, webapp_only=False): + calls["count"] += 1 + # Sanity: the dispatcher must always ask for webapp-only queues. + assert webapp_only is True + return calls["returns"] + + monkeypatch.setattr(sse_dispatch, "all_control_queues_for_declare", _fake) + return calls + + +class FakeClock: + """Controllable time source passed to ``SSEEventDispatcher``. 
+ + Lets tests advance the dispatcher's TTL cache deterministically without + reaching into ``cachetools`` internals. + """ + + def __init__(self, start: float = 0.0) -> None: + self.now = start + + def __call__(self) -> float: + return self.now + + def advance(self, seconds: float) -> None: + self.now += seconds + + +@pytest.fixture +def clock() -> FakeClock: + return FakeClock() + + +@pytest.fixture +def dispatcher(monkeypatch, clock): + """Build a dispatcher with a stub queue_worker and stub application_stack. + + ``ControlTask`` is swapped for a no-op so ``_send`` doesn't try to open a + real AMQP connection. + """ + queue_worker = MagicMock(name="queue_worker") + application_stack = MagicMock(name="application_stack") + + fake_control_task = MagicMock() + fake_control_task.return_value.send_task = MagicMock() + monkeypatch.setattr(sse_dispatch, "ControlTask", fake_control_task) + + return SSEEventDispatcher(queue_worker=queue_worker, application_stack=application_stack, clock=clock) + + +def test_declare_queues_cached_within_ttl(dispatcher, fake_declare): + """Repeated dispatches inside the TTL window only hit the DB once.""" + for _ in range(10): + dispatcher.notify_broadcast("payload") + assert fake_declare["count"] == 1 + + +def test_declare_queues_refetched_after_ttl(dispatcher, fake_declare, clock): + """Once the TTL expires, the next call refetches exactly once.""" + dispatcher.notify_broadcast("payload") + assert fake_declare["count"] == 1 + + # Advance the injected clock past the TTL so the cache sees the entry as + # expired on the next read. + clock.advance(dispatcher._DECLARE_QUEUES_TTL_SECONDS + 1) + + dispatcher.notify_broadcast("payload") + assert fake_declare["count"] == 2 + + # Further dispatches at the advanced time reuse the newly populated entry. 
+ dispatcher.notify_broadcast("payload") + assert fake_declare["count"] == 2 + + +def test_empty_result_not_cached(dispatcher, fake_declare): + """An empty list must not be pinned in the cache for the full TTL. + + Empty results arise during startup (before ``DatabaseHeartbeat`` writes the + row) and on swallowed DB errors. Caching them would silently drop every SSE + event until the next TTL expiry. + """ + fake_declare["returns"] = [] + dispatcher.notify_broadcast("payload") + dispatcher.notify_broadcast("payload") + assert fake_declare["count"] == 2 + + # Once the upstream starts returning a non-empty result, caching resumes. + fake_declare["returns"] = [MagicMock(name="queue")] + dispatcher.notify_broadcast("payload") + dispatcher.notify_broadcast("payload") + assert fake_declare["count"] == 3 + + +def test_declare_queues_thread_safe_single_query_under_load(dispatcher, fake_declare): + """Concurrent ``_send`` from many threads still only triggers one DB query. + + With stampede protection (RLock around the miss) all 500 dispatches should + collapse to a single ``all_control_queues_for_declare`` call inside one TTL + window. The assertion is exact (== 1), not loose, because the lock + serializes the miss. 
+ """ + iterations = 500 + + def work(): + dispatcher.notify_broadcast("payload") + + with ThreadPoolExecutor(max_workers=16) as pool: + futures = [pool.submit(work) for _ in range(iterations)] + for future in as_completed(futures): + future.result() + + assert fake_declare["count"] == 1 diff --git a/test/unit/app/queue_worker/test_queue_worker.py b/test/unit/app/queue_worker/test_queue_worker.py index 09bc8a32fa54..857bdeb2cac2 100644 --- a/test/unit/app/queue_worker/test_queue_worker.py +++ b/test/unit/app/queue_worker/test_queue_worker.py @@ -1,6 +1,13 @@ import datetime import time +from dataclasses import ( + dataclass, + field, +) from math import inf +from types import SimpleNamespace +from typing import Optional +from unittest.mock import MagicMock import pytest @@ -14,6 +21,27 @@ from galaxy.web_stack import application_stack_instance +@dataclass +class FakeStatsdClient: + """In-memory statsd recorder — see docstring in ``test_sse_dispatch.py``.""" + + counters: dict[tuple[str, tuple[tuple[str, str], ...]], int] = field(default_factory=dict) + timings: list[tuple[str, float, tuple[tuple[str, str], ...]]] = field(default_factory=list) + + def incr(self, metric: str, tags: Optional[dict[str, str]] = None) -> None: + key = (metric, tuple(sorted((tags or {}).items()))) + self.counters[key] = self.counters.get(key, 0) + 1 + + def timing(self, metric: str, value: float, tags: Optional[dict[str, str]] = None) -> None: + self.timings.append((metric, value, tuple(sorted((tags or {}).items())))) + + def counter(self, metric: str, tags: Optional[dict[str, str]] = None) -> int: + return self.counters.get((metric, tuple(sorted((tags or {}).items()))), 0) + + def timings_for(self, metric: str) -> list[tuple[float, dict[str, str]]]: + return [(v, dict(t)) for m, v, t in self.timings if m == metric] + + def bar(app, **kwargs): app.some_var = "bar" app.tasks_executed.append("echo") @@ -119,3 +147,86 @@ def wait_for_var(obj, var, value, tries=10, sleep=0.25): tries -= 1 
time.sleep(sleep) assert getattr(obj, var) == value + + +# --------------------------------------------------------------------------- +# process_task observability tests +# +# These don't need a real broker or DB — we just instantiate GalaxyQueueWorker +# via ``__new__`` and drive ``process_task`` directly with fake ``body`` / +# ``message`` objects. The statsd client is pulled through +# ``app.execution_timer_factory.galaxy_statsd_client``. +# --------------------------------------------------------------------------- + + +def _make_fake_worker(task_fn, statsd_client): + worker = GalaxyQueueWorker.__new__(GalaxyQueueWorker) + worker.app = SimpleNamespace( + config=SimpleNamespace(server_name="test.server"), + execution_timer_factory=SimpleNamespace(galaxy_statsd_client=statsd_client), + ) + worker.task_mapping = {"echo": task_fn} + worker.epoch = 0 + # ``producer`` is a read-only property on the mixin; we avoid the publisher + # path entirely by leaving ``reply_to`` out of the fake message properties. + return worker + + +def _fake_message(): + message = MagicMock() + message.headers = {"epoch": inf} # always greater than worker.epoch + message.properties = {} # no reply_to — skip publisher path + return message + + +def test_process_task_emits_counter_and_ok_timer(): + statsd = FakeStatsdClient() + calls: list[dict] = [] + + def handler(app, **kwargs): + calls.append(kwargs) + return "done" + + worker = _make_fake_worker(handler, statsd) + worker.process_task({"task": "echo", "kwargs": {"x": 1}}, _fake_message()) + + # Handler actually ran — if the worker silently dropped the task, this list + # would be empty and the test would fail loudly. 
+ assert calls == [{"x": 1}] + assert statsd.counter("galaxy.control_queue.task.count", {"task": "echo"}) == 1 + assert len(statsd.timings_for("galaxy.control_queue.task.latency_ms")) == 1 + _value, tags = statsd.timings_for("galaxy.control_queue.task.latency_ms")[0] + assert tags == {"task": "echo", "outcome": "ok"} + + +def test_process_task_emits_error_timer_on_handler_exception(): + statsd = FakeStatsdClient() + invocations: list[bool] = [] + + def boom(app, **kwargs): + invocations.append(True) + raise RuntimeError("handler failed") + + worker = _make_fake_worker(boom, statsd) + # process_task swallows handler exceptions (logged, not raised). + worker.process_task({"task": "echo", "kwargs": {}}, _fake_message()) + + # Handler was actually invoked before raising. + assert invocations == [True] + assert statsd.counter("galaxy.control_queue.task.count", {"task": "echo"}) == 1 + assert len(statsd.timings_for("galaxy.control_queue.task.latency_ms")) == 1 + _value, tags = statsd.timings_for("galaxy.control_queue.task.latency_ms")[0] + assert tags == {"task": "echo", "outcome": "error"} + + +def test_process_task_no_statsd_is_silent_no_op(): + calls: list[dict] = [] + + def handler(app, **kwargs): + calls.append(kwargs) + return "done" + + worker = _make_fake_worker(handler, statsd_client=None) + worker.process_task({"task": "echo", "kwargs": {"x": 2}}, _fake_message()) + # Handler ran — the test guards both "no exception" AND "task not silently dropped". + assert calls == [{"x": 2}] From 53f36246ea277f11f2eb72b38e1bae6ee121092a Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Mon, 20 Apr 2026 15:28:03 +0200 Subject: [PATCH 14/47] Expose enable_sse_entry_point_updates through /api/configuration The frontend entryPointStore reads this flag from configStore to pick between SSE and polling. Without the allowlist entry, the key is never serialized to the client, so even when operators set it to true the browser sees undefined and falls back to the polling branch. 
--- lib/galaxy/managers/configuration.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/galaxy/managers/configuration.py b/lib/galaxy/managers/configuration.py index 9d7647e2fb54..7637520c7287 100644 --- a/lib/galaxy/managers/configuration.py +++ b/lib/galaxy/managers/configuration.py @@ -230,6 +230,7 @@ def _config_is_truthy(item, key, **context): "tool_training_recommendations_api_url": _use_config, "enable_notification_system": _use_config, "enable_sse_history_updates": _use_config, + "enable_sse_entry_point_updates": _use_config, "instance_resource_url": _use_config, "instance_access_url": _use_config, "organization_name": _use_config, From 7384f192b6f5fbfd897dae9f10fd71f146b29553 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 21 Apr 2026 10:26:38 +0200 Subject: [PATCH 15/47] Fix SSE selenium tests for Playwright backend and user identity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Five related fixes to make the SSE selenium tests pass end-to-end: - Swap ``self.driver.execute_script`` for ``self.execute_script`` so the tests work with both the Selenium and Playwright backends; the former raises ``NotImplementedError`` under Playwright. - Switch user-info lookup from ``self._get("users/current")`` (API interactor's default key) to ``self.api_get("users/current")`` (browser cookie). ``ensure_registered`` creates a distinct Selenium user, so the API-key user and the browser-SSE user were different and ``push_to_user`` landed in no queue. - Add ``interactivetools_enable`` and ``enable_sse_entry_point_updates`` to the entry-point test config — App.vue won't start watching entry points, and the store won't open an SSE connection, without both. - Use ``self.get(path)`` (goes through ``build_url``) instead of raw URL concatenation, and pass wait timeouts in seconds (not milliseconds); the timeout helper multiplies by 1000 internally.
- Target ``.nav-indicator`` on the activity-bar item rather than a ``.indicator`` class that the bell component no longer renders. --- .../test_entry_point_sse.py | 15 ++++++++-- .../test_notification_sse.py | 29 ++++++++++++------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/test/integration_selenium/test_entry_point_sse.py b/test/integration_selenium/test_entry_point_sse.py index e661cb0a79fb..e579c70a3abe 100644 --- a/test/integration_selenium/test_entry_point_sse.py +++ b/test/integration_selenium/test_entry_point_sse.py @@ -30,17 +30,23 @@ class TestEntryPointSSESeleniumIntegration(SeleniumIntegrationTestCase): def handle_galaxy_config_kwds(cls, config): super().handle_galaxy_config_kwds(config) config["enable_celery_tasks"] = False + # App.vue only calls entryPointStore.startWatchingEntryPoints() when + # interactivetools_enable is True, and the store only opens an SSE + # connection when enable_sse_entry_point_updates is True. Without both, + # __galaxy_sse_connected never becomes true and the gate below times out. 
+ config["interactivetools_enable"] = True + config["enable_sse_entry_point_updates"] = True def _wait_for_sse_connected(self) -> None: """Block until the frontend confirms the SSE pipeline is live.""" wait_on( - lambda: True if self.driver.execute_script("return window.__galaxy_sse_connected === true") else None, + lambda: True if self.execute_script("return window.__galaxy_sse_connected === true") else None, "window.__galaxy_sse_connected === true", timeout=SSE_CONNECT_TIMEOUT_SECONDS, ) def _last_sse_event_ts(self) -> int: - return self.driver.execute_script("return window.__galaxy_sse_last_event_ts || 0") or 0 + return self.execute_script("return window.__galaxy_sse_last_event_ts || 0") or 0 def _wait_for_sse_event_after(self, baseline_ts: int) -> None: wait_on( @@ -55,7 +61,10 @@ def _create_it_job_with_entry_point(self, tool_port: int = 8888) -> tuple[int, i Job, ) - user_info = self._get("users/current").json() + # Use the browser's cookie-authenticated user, not the API interactor's + # default: SSE connects under the Selenium-registered user, and the + # dispatch's user_id must match or push_to_user finds no queues. + user_info = self.api_get("users/current") user_id = self._app.security.decode_id(user_info["id"]) sa_session = self._app.model.context job = Job() diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index cae3b6e7b6ba..d1902d2b1beb 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -35,14 +35,14 @@ def _wait_for_sse_connected(self) -> None: test would falsely pass. 
""" wait_on( - lambda: True if self.driver.execute_script("return window.__galaxy_sse_connected === true") else None, + lambda: True if self.execute_script("return window.__galaxy_sse_connected === true") else None, "window.__galaxy_sse_connected === true", timeout=SSE_CONNECT_TIMEOUT_SECONDS, ) def _last_sse_event_ts(self) -> int: """Return the last SSE event timestamp recorded by the composable, or 0.""" - return self.driver.execute_script("return window.__galaxy_sse_last_event_ts || 0") or 0 + return self.execute_script("return window.__galaxy_sse_last_event_ts || 0") or 0 def _wait_for_sse_event_after(self, baseline_ts: int) -> None: """Block until an SSE event arrives after ``baseline_ts``. @@ -61,12 +61,18 @@ def _wait_for_sse_event_after(self, baseline_ts: int) -> None: @managed_history def test_notification_appears_via_sse(self): """Send a notification via the API and verify it appears in the UI without refresh.""" - # Get the logged-in user's info so we can send a notification to them - user_info = self._get("users/current").json() + # Get the browser-logged-in user's info via the browser's cookie. ``self._get`` + # uses the API interactor's default-user key, which does not match the Selenium + # user created by ``ensure_registered``, so the SSE push would target a + # different user than the one watching the stream. + user_info = self.api_get("users/current") user_id = user_info["id"] - # Navigate to notifications page so the store is watching - self.driver.get(f"{self.target_url_from_selenium}/user/notifications") + # Navigate to notifications page so the store is watching. + # ``get()`` uses ``build_url()`` which handles trailing slashes on the + # base correctly; concatenating against ``target_url_from_selenium`` + # can produce a double-slash that Galaxy routes differently. 
+ self.get("user/notifications") self._wait_for_sse_connected() baseline_ts = self._last_sse_event_ts() self.screenshot("notification_sse_before") @@ -94,14 +100,16 @@ def test_notification_appears_via_sse(self): # the UI still shows the notification, polling picked it up — a silent # regression this assertion catches. self._wait_for_sse_event_after(baseline_ts) - self.wait_for_selector_visible(f"text={subject}", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) + self.wait_for_selector_visible(f"text={subject}", timeout=SSE_EVENT_TIMEOUT_SECONDS) self.screenshot("notification_sse_after") @selenium_test @managed_history def test_notification_bell_updates_via_sse(self): """The notification bell indicator should update when a new notification arrives via SSE.""" - user_info = self._get("users/current").json() + # See ``test_notification_appears_via_sse`` — must use the browser's user, + # not the API interactor's default user. + user_info = self.api_get("users/current") user_id = user_info["id"] # Go to home page (bell is in masthead) @@ -128,6 +136,7 @@ def test_notification_bell_updates_via_sse(self): self._assert_status_code_is_ok(response) self._wait_for_sse_event_after(baseline_ts) - # The indicator dot should appear on the bell (within the #activity-notifications element) - self.wait_for_selector_visible("#activity-notifications .indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS * 1000) + # The activity-bar notifications item renders its unread-count badge as + # ``.nav-indicator`` (see ``ActivityItem.vue``) once ``totalUnreadCount > 0``. 
+ self.wait_for_selector_visible("#activity-notifications .nav-indicator", timeout=SSE_EVENT_TIMEOUT_SECONDS) self.screenshot("notification_bell_indicator") From da97ee61917284b299e3f865de2a85b2b5630e5b Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Tue, 21 Apr 2026 10:33:39 +0200 Subject: [PATCH 16/47] Add types-cachetools to typecheck deps --- lib/galaxy/dependencies/pinned-typecheck-requirements.txt | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/lib/galaxy/dependencies/pinned-typecheck-requirements.txt b/lib/galaxy/dependencies/pinned-typecheck-requirements.txt index f0cb63fba34f..cced7deee47c 100644 --- a/lib/galaxy/dependencies/pinned-typecheck-requirements.txt +++ b/lib/galaxy/dependencies/pinned-typecheck-requirements.txt @@ -18,6 +18,7 @@ tomli==2.4.1 ; python_full_version < '3.11' types-awscrt==0.31.3 types-bleach==6.3.0.20260408 types-boto==2.49.18.20241019 +types-cachetools==6.2.0.20260408 types-contextvars==2.4.7.3 types-dataclasses==0.6.6 types-docutils==0.22.3.20260408 diff --git a/pyproject.toml b/pyproject.toml index ee68a0fc1c05..c3ec9ed16cf7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -176,6 +176,7 @@ typecheck = [ "pydantic>=2.7.4", # for pydantic.mypy plugin "types-bleach", "types-boto", + "types-cachetools", "types-contextvars", "types-dataclasses", "types-docutils", From f053d80bad78a45170b99e1f49c2eedadf4b5a36 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 23 Apr 2026 12:43:16 +0200 Subject: [PATCH 17/47] Swap Playwright text= selector for xpath in notification SSE test wait_for_selector_visible dispatches as a CSS selector on both backends; `text=` is a Playwright engine prefix that Selenium's css-selector path rejects with InvalidSelectorException. wait_for_xpath_visible works on both backends. 
--- test/integration_selenium/test_notification_sse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index d1902d2b1beb..7e2fcf15e711 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -100,7 +100,11 @@ def test_notification_appears_via_sse(self): # the UI still shows the notification, polling picked it up — a silent # regression this assertion catches. self._wait_for_sse_event_after(baseline_ts) - self.wait_for_selector_visible(f"text={subject}", timeout=SSE_EVENT_TIMEOUT_SECONDS) + # Use xpath (works on both Selenium and Playwright backends); `text=` is a + # Playwright engine prefix that Selenium's css-selector path rejects. + self.wait_for_xpath_visible( + f'//*[contains(text(), "{subject}")]', timeout=SSE_EVENT_TIMEOUT_SECONDS + ) self.screenshot("notification_sse_after") @selenium_test From f8cb87ac6e8aa6d3b764ef22072289280fd91fea Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 23 Apr 2026 13:05:30 +0200 Subject: [PATCH 18/47] Apply black formatting to SSE test --- test/integration_selenium/test_notification_sse.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/integration_selenium/test_notification_sse.py b/test/integration_selenium/test_notification_sse.py index 7e2fcf15e711..8e787599fbe1 100644 --- a/test/integration_selenium/test_notification_sse.py +++ b/test/integration_selenium/test_notification_sse.py @@ -102,9 +102,7 @@ def test_notification_appears_via_sse(self): self._wait_for_sse_event_after(baseline_ts) # Use xpath (works on both Selenium and Playwright backends); `text=` is a # Playwright engine prefix that Selenium's css-selector path rejects. 
- self.wait_for_xpath_visible( - f'//*[contains(text(), "{subject}")]', timeout=SSE_EVENT_TIMEOUT_SECONDS - ) + self.wait_for_xpath_visible(f'//*[contains(text(), "{subject}")]', timeout=SSE_EVENT_TIMEOUT_SECONDS) self.screenshot("notification_sse_after") @selenium_test From cc289e93f181e2118ac83e734738f97b63e8d0d5 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 23 Apr 2026 13:51:23 +0200 Subject: [PATCH 19/47] TEMP: enable notification/SSE flags across all UI test workflows Flip enable_notification_system, enable_sse_history_updates, and enable_sse_entry_point_updates in the Selenium, Playwright, and Integration Selenium workflows so the full UI test surface exercises the SSE pipeline. Revert before merging. --- .github/workflows/integration_selenium.yaml | 4 ++++ .github/workflows/playwright.yaml | 4 ++++ .github/workflows/selenium.yaml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/.github/workflows/integration_selenium.yaml b/.github/workflows/integration_selenium.yaml index c6ca7e437397..88212edd124e 100644 --- a/.github/workflows/integration_selenium.yaml +++ b/.github/workflows/integration_selenium.yaml @@ -19,6 +19,10 @@ env: YARN_INSTALL_OPTS: --frozen-lockfile GALAXY_CONFIG_SQLALCHEMY_WARN_20: '1' GALAXY_DEPENDENCIES_INSTALL_WEASYPRINT: '1' + # TEMP: shake down SSE/notification system across full UI surface — revert before merge + GALAXY_CONFIG_OVERRIDE_ENABLE_NOTIFICATION_SYSTEM: '1' + GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_HISTORY_UPDATES: '1' + GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_ENTRY_POINT_UPDATES: '1' concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true diff --git a/.github/workflows/playwright.yaml b/.github/workflows/playwright.yaml index 163c83b38fd3..44fddab54988 100644 --- a/.github/workflows/playwright.yaml +++ b/.github/workflows/playwright.yaml @@ -20,6 +20,10 @@ env: GALAXY_TEST_SELENIUM_HEADLESS: 1 YARN_INSTALL_OPTS: --frozen-lockfile GALAXY_CONFIG_SQLALCHEMY_WARN_20: '1' + # TEMP: shake down 
SSE/notification system across full UI surface — revert before merge + GALAXY_CONFIG_OVERRIDE_ENABLE_NOTIFICATION_SYSTEM: '1' + GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_HISTORY_UPDATES: '1' + GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_ENTRY_POINT_UPDATES: '1' concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true diff --git a/.github/workflows/selenium.yaml b/.github/workflows/selenium.yaml index 60003fdf51ea..7f5c8490872d 100644 --- a/.github/workflows/selenium.yaml +++ b/.github/workflows/selenium.yaml @@ -19,6 +19,10 @@ env: GALAXY_TEST_SKIP_FLAKEY_TESTS_ON_ERROR: 1 YARN_INSTALL_OPTS: --frozen-lockfile GALAXY_CONFIG_SQLALCHEMY_WARN_20: '1' + # TEMP: shake down SSE/notification system across full UI surface — revert before merge + GALAXY_CONFIG_OVERRIDE_ENABLE_NOTIFICATION_SYSTEM: '1' + GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_HISTORY_UPDATES: '1' + GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_ENTRY_POINT_UPDATES: '1' concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true From 4b90e4facf21462a247476d6cc3c76f2610825d2 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 23 Apr 2026 15:36:44 +0200 Subject: [PATCH 20/47] Route history_update SSE events for anonymous sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HistoryAuditMonitor only dispatched history_update events via push_to_user, so anonymous-owned histories (user_id IS NULL) never produced events. With enable_sse_history_updates on, the client disables polling and waits for SSE events — leaving anonymous history panels frozen, which broke seven UI tests across the Playwright suites. Extend the pipeline with a parallel galaxy_session.id-keyed route: - SSEConnectionManager tracks a _session_connections map alongside _connections and exposes push_to_session; connect/disconnect/stream accept an optional galaxy_session_id. - EventsService.open_stream forwards trans.galaxy_session.id so anonymous sessions register under their session key. 
- SSEEventDispatcher.history_update and HistoryUpdatePayload gain an optional session_updates dict; the queue_worker handler fans out session-keyed events via push_to_session. - HistoryAuditMonitor caches (user_id, session_ids) per history and performs one extra indexed lookup against GalaxySessionToHistoryAssociation only for anon histories. galaxy_session.id never leaves the server — it's used only as an in-memory/AMQP dispatch key; the browser-visible event payload still contains just encoded history_ids. --- lib/galaxy/managers/history_audit_monitor.py | 58 ++++++++++++++++---- lib/galaxy/managers/sse.py | 40 ++++++++++++-- lib/galaxy/managers/sse_dispatch.py | 23 +++++--- lib/galaxy/queue_worker/__init__.py | 16 +++++- lib/galaxy/webapps/galaxy/services/events.py | 10 +++- 5 files changed, 119 insertions(+), 28 deletions(-) diff --git a/lib/galaxy/managers/history_audit_monitor.py b/lib/galaxy/managers/history_audit_monitor.py index ec76cf94a2fe..c3291dd2e843 100644 --- a/lib/galaxy/managers/history_audit_monitor.py +++ b/lib/galaxy/managers/history_audit_monitor.py @@ -31,6 +31,7 @@ from galaxy.config import GalaxyAppConfiguration from galaxy.managers.sse_dispatch import SSEEventDispatcher from galaxy.model import ( + GalaxySessionToHistoryAssociation, History, HistoryAudit, ) @@ -119,8 +120,11 @@ def __init__( self._exit = threading.Event() self._thread: Optional[threading.Thread] = None self._active = False - # Bounded LRU cache: history_id -> user_id, refreshed on miss. - self._history_owner_cache: OrderedDict[int, int] = OrderedDict() + # Bounded LRU cache: history_id -> (user_id, session_ids), refreshed on miss. + # For registered-owned histories: (user_id, ()); for anonymous histories: + # (None, (session_id, ...)) — a history can be associated with multiple + # sessions via GalaxySessionToHistoryAssociation. 
+ self._history_owner_cache: OrderedDict[int, tuple[Optional[int], tuple[int, ...]]] = OrderedDict() def start(self) -> None: if self._active: @@ -227,7 +231,7 @@ def _poll_audit_table(self) -> None: # --- Common dispatch logic --- def _dispatch_history_updates(self, history_ids: set[int]) -> None: - """Map history_ids to user_ids and send Kombu control task. + """Map history_ids to user_ids / session_ids and send Kombu control task. Raw integer history IDs are sent across the control queue; encoding is deferred to the ``history_update`` task handler on the receiving side, @@ -239,24 +243,58 @@ def _dispatch_history_updates(self, history_ids: set[int]) -> None: self._refresh_owner_cache(unknown) user_updates: dict[str, list[int]] = defaultdict(list) + session_updates: dict[str, list[int]] = defaultdict(list) for history_id in history_ids: - user_id = self._history_owner_cache.get(history_id) + entry = self._history_owner_cache.get(history_id) + if entry is None: + continue + user_id, session_ids = entry if user_id is not None: user_updates[str(user_id)].append(history_id) + else: + for session_id in session_ids: + session_updates[str(session_id)].append(history_id) - if not user_updates: + if not user_updates and not session_updates: return - self._dispatcher.history_update(user_updates=dict(user_updates)) + self._dispatcher.history_update( + user_updates=dict(user_updates), + session_updates=dict(session_updates) if session_updates else None, + ) def _refresh_owner_cache(self, history_ids: set[int]) -> None: - """Look up user_id for given history_ids and update the bounded cache.""" + """Look up ownership for given history_ids and update the bounded cache. + + Registered-owned histories resolve with just ``History.user_id``. For + histories where ``user_id IS NULL`` we additionally fetch associated + ``galaxy_session.id`` values from ``GalaxySessionToHistoryAssociation`` + so the anonymous SSE dispatch path can target the right browser. 
+ """ try: - stmt = sa_select(History.id, History.user_id).where(History.id.in_(history_ids)) with self._model.new_session() as session: + stmt = sa_select(History.id, History.user_id).where(History.id.in_(history_ids)) + anon_history_ids: set[int] = set() for row in session.execute(stmt): - self._history_owner_cache[row[0]] = row[1] - self._history_owner_cache.move_to_end(row[0]) + hid, uid = row[0], row[1] + self._history_owner_cache[hid] = (uid, ()) + self._history_owner_cache.move_to_end(hid) + if uid is None: + anon_history_ids.add(hid) + + if anon_history_ids: + assoc_stmt = sa_select( + GalaxySessionToHistoryAssociation.history_id, + GalaxySessionToHistoryAssociation.session_id, + ).where(GalaxySessionToHistoryAssociation.history_id.in_(anon_history_ids)) + sessions_by_history: dict[int, list[int]] = defaultdict(list) + for row in session.execute(assoc_stmt): + hid, sid = row[0], row[1] + if sid is not None: + sessions_by_history[hid].append(sid) + for hid, sids in sessions_by_history.items(): + self._history_owner_cache[hid] = (None, tuple(sids)) + while len(self._history_owner_cache) > OWNER_CACHE_MAX: self._history_owner_cache.popitem(last=False) except Exception: diff --git a/lib/galaxy/managers/sse.py b/lib/galaxy/managers/sse.py index 66b54bb002d5..0284b1786b6b 100644 --- a/lib/galaxy/managers/sse.py +++ b/lib/galaxy/managers/sse.py @@ -81,6 +81,7 @@ class SSEConnectionManager: def __init__(self, statsd_client: Optional[VanillaGalaxyStatsdClient] = None) -> None: self._connections: dict[int, set[asyncio.Queue]] = defaultdict(set) + self._session_connections: dict[int, set[asyncio.Queue]] = defaultdict(set) self._broadcast_connections: set[asyncio.Queue] = set() self._loop: Optional[asyncio.AbstractEventLoop] = None self._statsd_client = statsd_client @@ -92,27 +93,39 @@ def _ensure_loop(self) -> None: # -- Called from ASYNC context (uvicorn event loop thread) -- - def connect(self, user_id: Optional[int]) -> asyncio.Queue: + def connect(self, user_id: 
Optional[int], galaxy_session_id: Optional[int] = None) -> asyncio.Queue: """Register a new SSE connection. Returns a queue to await events from. Called from the SSE endpoint handler (async context). A ``ready`` event is enqueued immediately so that clients (and tests) can synchronize on the server-side subscription rather than the underlying socket open event. + + ``galaxy_session_id`` is the dispatch key for events that target a + specific browser session (e.g. history updates for anonymous users, + whose ``user_id`` is ``None``). """ self._ensure_loop() queue: asyncio.Queue = asyncio.Queue(maxsize=64) if user_id is not None: self._connections[user_id].add(queue) + if galaxy_session_id is not None: + self._session_connections[galaxy_session_id].add(queue) self._broadcast_connections.add(queue) queue.put_nowait(SSEEvent(event="ready", data="")) log.debug( - "SSE connection opened for user_id=%s (total=%d)", + "SSE connection opened for user_id=%s session_id=%s (total=%d)", user_id, + galaxy_session_id, len(self._broadcast_connections), ) return queue - def disconnect(self, user_id: Optional[int], queue: asyncio.Queue) -> None: + def disconnect( + self, + user_id: Optional[int], + queue: asyncio.Queue, + galaxy_session_id: Optional[int] = None, + ) -> None: """Unregister an SSE connection. Called from the SSE endpoint's ``finally`` block (async context). 
@@ -121,10 +134,15 @@ def disconnect(self, user_id: Optional[int], queue: asyncio.Queue) -> None: self._connections[user_id].discard(queue) if not self._connections[user_id]: del self._connections[user_id] + if galaxy_session_id is not None: + self._session_connections[galaxy_session_id].discard(queue) + if not self._session_connections[galaxy_session_id]: + del self._session_connections[galaxy_session_id] self._broadcast_connections.discard(queue) log.debug( - "SSE connection closed for user_id=%s (total=%d)", + "SSE connection closed for user_id=%s session_id=%s (total=%d)", user_id, + galaxy_session_id, len(self._broadcast_connections), ) @@ -135,6 +153,15 @@ def push_to_user(self, user_id: int, event: SSEEvent) -> None: for queue in list(self._connections.get(user_id, [])): self._safe_put(queue, event) + def push_to_session(self, galaxy_session_id: int, event: SSEEvent) -> None: + """Thread-safe. Push an event to all SSE connections for a specific galaxy_session. + + Used to route per-browser events (e.g. history updates for anonymous + histories) when there is no registered ``user_id`` to key on. + """ + for queue in list(self._session_connections.get(galaxy_session_id, [])): + self._safe_put(queue, event) + def push_broadcast(self, event: SSEEvent) -> None: """Thread-safe. Push an event to ALL connected SSE clients.""" for queue in list(self._broadcast_connections): @@ -189,6 +216,7 @@ async def stream( user_id: Optional[int], catch_up: Optional[SSEEvent] = None, keepalive: float = 30.0, + galaxy_session_id: Optional[int] = None, ) -> AsyncIterator[str]: """Yield SSE-framed strings for one connected client. @@ -198,7 +226,7 @@ async def stream( what the service passes in (typically ``request.is_disconnected`` from starlette) so the manager stays framework-agnostic. 
""" - queue = self.connect(user_id) + queue = self.connect(user_id, galaxy_session_id) if catch_up is not None: await queue.put(catch_up) try: @@ -211,4 +239,4 @@ async def stream( except asyncio.TimeoutError: yield ": keepalive\n\n" finally: - self.disconnect(user_id, queue) + self.disconnect(user_id, queue, galaxy_session_id) diff --git a/lib/galaxy/managers/sse_dispatch.py b/lib/galaxy/managers/sse_dispatch.py index 927cb1ab0d81..fc9b6e84a087 100644 --- a/lib/galaxy/managers/sse_dispatch.py +++ b/lib/galaxy/managers/sse_dispatch.py @@ -123,14 +123,21 @@ def notify_broadcast(self, payload: str, event_id: Optional[str] = None) -> None }, ) - def history_update(self, user_updates: dict[str, list[int]], event_id: Optional[str] = None) -> None: - self._send( - "history_update", - { - "user_updates": user_updates, - "event_id": event_id or make_event_id(), - }, - ) + def history_update( + self, + user_updates: dict[str, list[int]], + event_id: Optional[str] = None, + session_updates: Optional[dict[str, list[int]]] = None, + ) -> None: + kwargs: dict[str, Any] = { + "user_updates": user_updates, + "event_id": event_id or make_event_id(), + } + if session_updates: + # Only include when non-empty: anonymous histories are uncommon on + # most deployments, and an empty dict is wasted wire payload. + kwargs["session_updates"] = session_updates + self._send("history_update", kwargs) def entry_point_update(self, user_id: int, event_id: Optional[str] = None) -> None: """Fan out a wake-up ``entry_point_update`` event for one user. diff --git a/lib/galaxy/queue_worker/__init__.py b/lib/galaxy/queue_worker/__init__.py index a01fc27efc91..3d5684634ca2 100644 --- a/lib/galaxy/queue_worker/__init__.py +++ b/lib/galaxy/queue_worker/__init__.py @@ -69,10 +69,14 @@ class HistoryUpdatePayload(TypedDict, total=False): """Wire contract for the ``history_update`` control-task kwargs. ``user_updates`` maps stringified user IDs to lists of (unencoded) history IDs. 
- Stringified because AMQP JSON serialization coerces dict keys to strings. + ``session_updates`` is the parallel route for anonymous-owned histories, + keyed by stringified ``galaxy_session.id`` (the dispatch key never leaves + the server — browsers never see it). Stringified because AMQP JSON + serialization coerces dict keys to strings. """ user_updates: dict[str, list[int]] + session_updates: dict[str, list[int]] event_id: Optional[str] @@ -423,7 +427,9 @@ def history_update(app: "MinimalManagerApp", **kwargs) -> None: """Push SSE history update events to connected users on this worker process. Encodes integer history IDs here (not in the monitor) so the manager layer - stays free of presentation/security concerns. + stays free of presentation/security concerns. Handles both user-keyed + routing (registered users) and galaxy_session-keyed routing (anonymous + histories, which have ``user_id IS NULL``). """ payload = cast(HistoryUpdatePayload, kwargs) sse_manager = app[SSEConnectionManager] @@ -435,6 +441,12 @@ def history_update(app: "MinimalManagerApp", **kwargs) -> None: data = json.dumps({"history_ids": encoded_ids}) event = SSEEvent(event="history_update", data=data, id=event_id) sse_manager.push_to_user(user_id, event) + for session_id_str, history_ids in payload.get("session_updates", {}).items(): + session_id = int(session_id_str) + encoded_ids = [encode(hid) for hid in history_ids] + data = json.dumps({"history_ids": encoded_ids}) + event = SSEEvent(event="history_update", data=data, id=event_id) + sse_manager.push_to_session(session_id, event) def entry_point_update(app: "MinimalManagerApp", **kwargs) -> None: diff --git a/lib/galaxy/webapps/galaxy/services/events.py b/lib/galaxy/webapps/galaxy/services/events.py index aea3998ac25e..0eda3ae7a0f9 100644 --- a/lib/galaxy/webapps/galaxy/services/events.py +++ b/lib/galaxy/webapps/galaxy/services/events.py @@ -32,7 +32,13 @@ def open_stream( last_event_id: Optional[str], is_disconnected: IsDisconnected, ) 
-> AsyncIterator[str]: - """Open an SSE events stream; anonymous users receive only broadcasts.""" + """Open an SSE events stream. + + Anonymous users still register under their ``galaxy_session.id`` so the + server can route per-session events (e.g. ``history_update`` for + anonymous-owned histories) even when ``user_id`` is ``None``. + """ user_id = user_context.user.id if not user_context.anonymous else None + session_id = user_context.galaxy_session.id if user_context.galaxy_session else None catch_up = self.notifications.build_status_catchup(user_context, last_event_id) - return self.sse_manager.stream(is_disconnected, user_id, catch_up=catch_up) + return self.sse_manager.stream(is_disconnected, user_id, catch_up=catch_up, galaxy_session_id=session_id) From daba09653b96473912274bd3f3edb6c253e2a2a5 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 23 Apr 2026 22:05:50 +0200 Subject: [PATCH 21/47] Close EventSource on pagehide so full-page navigation doesn't race the socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Form-based login / registration / logout all navigate via ``window.location.href``. Chrome doesn't guarantee that an open ``EventSource`` (or two: history + notifications) is torn down before the navigation issues requests for the new page — in selenium we've seen the new page load with the stale anonymous cookie because the server still had the old stream's trans in flight when ``GET /`` landed. Symptom: masthead stays on Login/Register and ``/api/users/current`` returns the anonymous quota even though ``POST /user/create`` returned 200. Adding a ``pagehide`` listener inside ``useSSE`` forces ``eventSource.close()`` synchronously right before navigation, closing the race. ``pagehide`` is preferred over ``beforeunload`` because it fires for back-forward-cache restores too and can't be cancelled by other handlers. 
Reproduced locally with ``GALAXY_CONFIG_OVERRIDE_ENABLE_NOTIFICATION_SYSTEM=1 GALAXY_CONFIG_OVERRIDE_ENABLE_SSE_HISTORY_UPDATES=1 ./run_tests.sh -selenium lib/galaxy_test/selenium/test_history_sharing.py::TestHistorySharing::test_unsharing`` — 3/3 failures before this change, 3/3 passes after. --- client/src/composables/useNotificationSSE.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/client/src/composables/useNotificationSSE.ts b/client/src/composables/useNotificationSSE.ts index 944af5fb8d6b..56cf861a4a6a 100644 --- a/client/src/composables/useNotificationSSE.ts +++ b/client/src/composables/useNotificationSSE.ts @@ -45,6 +45,17 @@ export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: reado onEvent(event); }; + // Browser EventSource teardown during a full-page navigation + // (``window.location.href = …``) is not guaranteed to happen before the + // browser issues requests for the new page — we've seen Chrome keep the + // stream alive long enough that a login/register POST reload races the + // close, and the new page then loads with a stale auth view. Force a + // synchronous ``eventSource.close()`` during ``pagehide`` (which, unlike + // ``beforeunload``, also covers bfcache entry and cannot be cancelled). + // The listener is registered only while a connection is live so composables + // that never ``connect()`` don't leave dangling listeners behind.
+ const onPageHide = () => disconnect(); + function connect() { disconnect(); const url = withPrefix("/api/events/stream"); @@ -69,6 +80,10 @@ export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: reado connected.value = false; sseGlobals().__galaxy_sse_connected = false; }; + + if (typeof window !== "undefined") { + window.addEventListener("pagehide", onPageHide); + } } function disconnect() { @@ -79,6 +94,9 @@ export function useSSE(onEvent: (event: MessageEvent) => void, eventTypes: reado eventSource.close(); eventSource = null; } + if (typeof window !== "undefined") { + window.removeEventListener("pagehide", onPageHide); + } connected.value = false; sseGlobals().__galaxy_sse_connected = false; } From 01c714add5ba525434c37f9b9a1d9b057a4a71a9 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Thu, 23 Apr 2026 22:54:35 +0200 Subject: [PATCH 22/47] Guarantee playwright.stop() runs even when browser.close() raises MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Galaxy's Playwright driver teardown was ``browser.close()`` then ``playwright.stop()``. If ``browser.close()`` raised — in CI we've seen it hit target-detached errors or stall long enough to time out — the exception escaped before ``stop()`` ran, so the per-instance asyncio loop stayed flagged as "running" on the main test thread. Every subsequent test's ``sync_playwright().__enter__`` then rejected with "Playwright Sync API inside the asyncio loop" and the entire shard cascaded into 80+ ERRORs at setup. Wrap ``close()`` in ``try/finally`` and swallow + log any ``stop()`` exception so one test's bad teardown can't poison the next test's driver init. 
--- lib/galaxy/selenium/has_playwright_driver.py | 21 ++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/galaxy/selenium/has_playwright_driver.py b/lib/galaxy/selenium/has_playwright_driver.py index d2cf3aa6aead..93235d763ec8 100644 --- a/lib/galaxy/selenium/has_playwright_driver.py +++ b/lib/galaxy/selenium/has_playwright_driver.py @@ -112,6 +112,7 @@ def timeout_for(self, **kwds): """ import abc +import logging from contextlib import contextmanager from typing import ( Any, @@ -157,6 +158,8 @@ def timeout_for(self, **kwds): from .wait_methods_mixin import WaitMethodsMixin from .web_element_protocol import WebElementProtocol +logger = logging.getLogger(__name__) + UNSPECIFIED_TIMEOUT = object() @@ -1153,9 +1156,23 @@ def quit(self) -> None: This closes all windows/tabs and releases all system resources. The driver cannot be used after calling this method. + + ``browser.close()`` can raise — in CI we've seen it time out or hit + target-detached errors — and if the exception escapes before + ``playwright.stop()`` runs, the per-instance asyncio loop is left + registered as "running" on the main thread. Every subsequent test's + ``sync_playwright().__enter__`` then refuses to start with "Playwright + Sync API inside the asyncio loop", cascading the whole shard into + errors. Always tear down the Playwright instance even if the browser + close failed. 
""" - self.close() - self._playwright_resources.playwright.stop() + try: + self.close() + finally: + try: + self._playwright_resources.playwright.stop() + except Exception: + logger.exception("Error stopping Playwright instance during quit()") __all__ = ( From 1ae724f40b2ef776b8e8c52cf08c815c5a9b9933 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 24 Apr 2026 09:08:13 +0200 Subject: [PATCH 23/47] Cancel background traffic before login/register so session cookie isn't clobbered When handle_user_login invalidates the previous anonymous session and a concurrent request using the old cookie is still in flight, the server creates a *new* anonymous session for it and responds with a fresh `Set-Cookie: galaxysession=`. If that response lands between the login POST and the full-page navigation, the browser navigates with the anonymous cookie and the new page loads logged out. Under the TEMP SSE flag this happens often enough to trip `wait_for_logged_in` in selenium. Fix: synchronously close all long-lived connections (SSE, polling watchers) and rotate a shared axios AbortController before sending the login/register POST. With no in-flight anonymous-cookie request, the server can't emit the clobbering Set-Cookie, and the authenticated cookie survives until navigation. 
--- client/src/stores/entryPointStore.ts | 28 +++++++++++++++------- client/src/stores/historyStore.ts | 31 ++++++++++++++++++------- client/src/stores/notificationsStore.ts | 16 +++++++++---- 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/client/src/stores/entryPointStore.ts b/client/src/stores/entryPointStore.ts index 68193406967f..f9f0e7d98c78 100644 --- a/client/src/stores/entryPointStore.ts +++ b/client/src/stores/entryPointStore.ts @@ -48,9 +48,10 @@ export const useEntryPointStore = defineStore("entryPointStore", () => { disconnect: sseDisconnect, connected: sseConnected, } = useSSE(handleEntryPointSSEEvent, ["entry_point_update"]); + let stopPolling: (() => void) | null = null; + let stopConnectedWatcher: (() => void) | null = null; let watchingInitialized = false; - let stopWatchingEntryPointsResource: (() => void) | null = null; // Callers opt in via ``startWatchingEntryPoints()`` (App.vue gates this on // ``interactivetools_enable``). We then pick SSE or polling based on the @@ -70,7 +71,7 @@ export const useEntryPointStore = defineStore("entryPointStore", () => { // navigated away and missed events" window. 
fetchEntryPoints().catch((err) => console.warn("Initial entry-point load failed", err)); sseConnect(); - watch(sseConnected, (isConnected, wasConnected) => { + stopConnectedWatcher = watch(sseConnected, (isConnected, wasConnected) => { if (isConnected && !wasConnected) { fetchEntryPoints().catch((err) => console.error("Error refreshing entry points on SSE reconnect:", err), @@ -82,7 +83,7 @@ export const useEntryPointStore = defineStore("entryPointStore", () => { shortPollingInterval: ACTIVE_POLLING_INTERVAL, enableBackgroundPolling: false, }); - stopWatchingEntryPointsResource = stopWatchingResource; + stopPolling = stopWatchingResource; startWatchingResource(); } }; @@ -136,11 +137,6 @@ export const useEntryPointStore = defineStore("entryPointStore", () => { return { ...original, ...updated }; } - function stopWatchingEntryPoints() { - sseDisconnect(); - stopWatchingEntryPointsResource?.(); - } - function removeEntryPoint(toolId: string) { const index = entryPoints.value.findIndex((ep) => { return ep.id === toolId ? true : false; @@ -150,6 +146,22 @@ export const useEntryPointStore = defineStore("entryPointStore", () => { } } + // Closes the SSE stream and stops the polling watcher; paired with login + // /register flows so background traffic doesn't outlive the navigation + // and clobber the freshly authenticated session cookie. 
+ function stopWatchingEntryPoints() { + sseDisconnect(); + if (stopPolling) { + stopPolling(); + stopPolling = null; + } + if (stopConnectedWatcher) { + stopConnectedWatcher(); + stopConnectedWatcher = null; + } + watchingInitialized = false; + } + return { entryPoints, entryPointsForJob, diff --git a/client/src/stores/historyStore.ts b/client/src/stores/historyStore.ts index ec2d91a0728b..a5086771c780 100644 --- a/client/src/stores/historyStore.ts +++ b/client/src/stores/historyStore.ts @@ -401,6 +401,8 @@ export const useHistoryStore = defineStore("historyStore", () => { handleHistorySSEEvent, SSE_HISTORY_EVENT_TYPES, ); + let stopHistoryPolling: (() => void) | null = null; + let stopIsWatchingWatcher: (() => void) | null = null; function handleHistorySSEEvent(event: MessageEvent) { try { @@ -433,7 +435,6 @@ export const useHistoryStore = defineStore("historyStore", () => { // polling we explicitly don't want. const isWatchingHistory = ref(false); let watchingInitialized = false; - let stopWatchingHistoryResource: (() => void) | null = null; function startWatchingHistoryWithSSE() { if (watchingInitialized) { return; @@ -459,8 +460,10 @@ export const useHistoryStore = defineStore("historyStore", () => { longPollingInterval: INACTIVE_POLLING_INTERVAL, }, ); - stopWatchingHistoryResource = stopWatchingResource; - watch(isWatchingResource, (v) => (isWatchingHistory.value = v), { immediate: true }); + stopHistoryPolling = stopWatchingResource; + stopIsWatchingWatcher = watch(isWatchingResource, (v) => (isWatchingHistory.value = v), { + immediate: true, + }); startWatchingResource(); } }; @@ -480,11 +483,6 @@ export const useHistoryStore = defineStore("historyStore", () => { } } - function stopWatchingHistory() { - sseHistoryDisconnect(); - stopWatchingHistoryResource?.(); - } - async function loadHistoryById(historyId: string) { if (!isLoadingHistory.has(historyId)) { isLoadingHistory.add(historyId); @@ -582,6 +580,23 @@ export const useHistoryStore = 
defineStore("historyStore", () => { return contentStats; } + // Closes SSE and stops polling so the watcher can't emit a trailing + // anonymous-cookie request that would overwrite the authenticated + // ``galaxysession`` cookie set by the login/register response. + function stopWatchingHistory() { + sseHistoryDisconnect(); + if (stopHistoryPolling) { + stopHistoryPolling(); + stopHistoryPolling = null; + } + if (stopIsWatchingWatcher) { + stopIsWatchingWatcher(); + stopIsWatchingWatcher = null; + } + isWatchingHistory.value = false; + watchingInitialized = false; + } + return { histories, changingCurrentHistory, diff --git a/client/src/stores/notificationsStore.ts b/client/src/stores/notificationsStore.ts index fae40d7f0e02..a935b97e0b49 100644 --- a/client/src/stores/notificationsStore.ts +++ b/client/src/stores/notificationsStore.ts @@ -28,6 +28,7 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { // --- SSE setup (listen only for notification event types) --- const NOTIFICATION_EVENT_TYPES = ["notification_update", "broadcast_update", "notification_status"] as const; const { connect: sseConnect, disconnect: sseDisconnect } = useSSE(handleSSEEvent, NOTIFICATION_EVENT_TYPES); + let stopPolling: (() => void) | null = null; function handleSSEEvent(event: MessageEvent) { try { @@ -121,7 +122,6 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { // every time the tab regains focus — in SSE mode that would re-start // polling we explicitly don't want. 
let watchingInitialized = false; - let stopPolling: (() => void) | null = null; function ensureWatchingWithConfig() { if (watchingInitialized) { return; @@ -133,7 +133,7 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { if (configStore.config?.enable_notification_system) { sseConnect(); } else { - const { startWatchingResource: startPollingResource, stopWatchingResource } = useResourceWatcher( + const { startWatchingResource: startPolling, stopWatchingResource } = useResourceWatcher( getNotificationStatus, { shortPollingInterval: ACTIVE_POLLING_INTERVAL, @@ -141,7 +141,7 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { }, ); stopPolling = stopWatchingResource; - startPollingResource(); + startPolling(); } }; @@ -206,9 +206,17 @@ export const useNotificationsStore = defineStore("notificationsStore", () => { totalUnreadCount.value = notifications.value.filter((n) => !n.seen_time).length; } + // Closes the SSE stream and stops the polling watcher so nothing running + // in the background can outlive a full-page navigation (login/register). + // A late-arriving response from an anonymous-cookie request would otherwise + // overwrite the just-issued authenticated ``galaxysession`` cookie. function stopWatchingNotifications() { sseDisconnect(); - stopPolling?.(); + if (stopPolling) { + stopPolling(); + stopPolling = null; + } + watchingInitialized = false; } return { From cb1ef59ea6db7804c9ab3d3f9a626e784478173d Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 24 Apr 2026 09:08:23 +0200 Subject: [PATCH 24/47] Tear down selenium driver when setUp fails before login completes pytest does not call tearDown when setUp raises, so if setup_selenium fails after setup_driver_and_session (e.g. wait_for_logged_in times out during register), the Playwright instance is leaked. 
Its per-instance asyncio loop stays registered as "running" on the main thread and every subsequent test in the shard errors with "Sync API inside the asyncio loop", cascading the whole shard. Wrap the post-allocation work in try/except that invokes tear_down_driver before re-raising. Backend-agnostic since tear_down_driver branches on backend_type. Complements the try/finally in HasPlaywrightDriver.quit (57f1abac5ad) which only helps when quit runs at all. --- lib/galaxy_test/selenium/framework.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/lib/galaxy_test/selenium/framework.py b/lib/galaxy_test/selenium/framework.py index 6b3b775705a1..d6dff43dd36d 100644 --- a/lib/galaxy_test/selenium/framework.py +++ b/lib/galaxy_test/selenium/framework.py @@ -3,6 +3,7 @@ import datetime import errno import json +import logging import os import traceback import unittest @@ -76,6 +77,8 @@ except ImportError: GalaxyTestDriver = None # type: ignore[assignment, misc, unused-ignore] +logger = logging.getLogger(__name__) + def _load_config_file() -> None: """ @@ -433,10 +436,22 @@ def setup_selenium(self): self.target_url_from_selenium = self._target_url_from_selenium() self.snapshots = [] self.setup_driver_and_session() - if self.run_as_admin and GALAXY_TEST_SELENIUM_ADMIN_USER_EMAIL == DEFAULT_ADMIN_USER: - self._setup_interactor() - self._setup_user(GALAXY_TEST_SELENIUM_ADMIN_USER_EMAIL) - self._try_setup_with_driver() + # Once the driver is allocated, any subsequent failure must still + # tear it down: pytest does not call tearDown when setUp raises, so + # without this the Playwright asyncio loop would stay registered as + # "running" on the main thread and cascade every subsequent test's + # setUp with "Sync API inside the asyncio loop". 
+ try: + if self.run_as_admin and GALAXY_TEST_SELENIUM_ADMIN_USER_EMAIL == DEFAULT_ADMIN_USER: + self._setup_interactor() + self._setup_user(GALAXY_TEST_SELENIUM_ADMIN_USER_EMAIL) + self._try_setup_with_driver() + except Exception: + try: + self.tear_down_driver() + except Exception: + logger.exception("Error tearing down driver after setup_selenium failure") + raise def _try_setup_with_driver(self): try: From 35c6077d7e7a3a4be69fe7e6a01ead176e3e5dd3 Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 24 Apr 2026 10:18:23 +0200 Subject: [PATCH 25/47] Extend auth-navigation abort to GalaxyApi (openapi-fetch) requests The previous login-race fix wired a shared AbortController through an axios interceptor, but most first-party API traffic now goes through GalaxyApi (openapi-fetch + native fetch), which ignored it. In-flight /api/... calls could therefore still return a clobbering Set-Cookie: galaxysession= after login. Add an openapi-fetch middleware that attaches the same shared signal to every request (combined via AbortSignal.any with any caller-set signal) and honours the SKIP_PENDING_REQUESTS_HEADER opt-out. Register it before the rate-limiter so aborted requests bypass the queue. Rename cancelPendingAxiosRequests -> cancelPendingRequests since one rotation now covers both transports. --- client/src/api/client/pendingRequestsMiddleware.ts | 5 ++++- client/src/composables/useAuthNavigation.ts | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/client/src/api/client/pendingRequestsMiddleware.ts b/client/src/api/client/pendingRequestsMiddleware.ts index 04eb4b2c11aa..b8c99c9337d7 100644 --- a/client/src/api/client/pendingRequestsMiddleware.ts +++ b/client/src/api/client/pendingRequestsMiddleware.ts @@ -18,7 +18,10 @@ export const pendingRequestsMiddleware: Middleware = { return new Request(request, { headers }); } const shared = getPendingAbortSignal(); - const signal = typeof AbortSignal.any === "function" ? 
AbortSignal.any([request.signal, shared]) : shared; + // Combine with any signal the caller may have set so we don't silently + // drop their cancellation semantics. + const signal = + typeof AbortSignal.any === "function" ? AbortSignal.any([request.signal, shared]) : shared; return new Request(request, { signal }); }, }; diff --git a/client/src/composables/useAuthNavigation.ts b/client/src/composables/useAuthNavigation.ts index 244d5a0012ef..69d7f88a00da 100644 --- a/client/src/composables/useAuthNavigation.ts +++ b/client/src/composables/useAuthNavigation.ts @@ -16,8 +16,9 @@ import { useNotificationsStore } from "@/stores/notificationsStore"; * will use a fresh signal and is not affected. */ export function discardActiveConnectionsBeforeAuthNavigation() { - // Stop polling watchers first so they can't kick off new fetches, then - // abort any requests still in flight via the shared AbortController. + // Order: close SSE streams first (synchronous TCP close), then stop the + // polling watchers so they can't kick off new fetches, then abort any + // requests still in flight via the shared AbortController. useHistoryStore().stopWatchingHistory(); useEntryPointStore().stopWatchingEntryPoints(); useNotificationsStore().stopWatchingNotifications(); From 6463c24e1baaf7e7ef9deb4e4b1c6235976b214f Mon Sep 17 00:00:00 2001 From: mvdbeek Date: Fri, 24 Apr 2026 11:32:42 +0200 Subject: [PATCH 26/47] Skip SSE/polling watchers in iframed Galaxy instances MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scratchbook windows (WinBox iframes) load the same analysis route (``/datasets/X/display``) as the main page, so each one boots a full Galaxy Vue app. ``historyStore.startWatchingHistory()`` was called unconditionally in ``App.vue::setup()``, meaning every iframe opened its own EventSource to ``/api/events/stream``. 
With 3 long-lived SSE streams from the main page plus one per iframe, two open dataset windows is enough to saturate the HTTP/1.1 6-connections-per-origin budget and hang the tab — ``test_scratchbook_window_persistence`` hung indefinitely on every CI run of Playwright shard 1. Treat any frame where ``window.top !== window.self`` as embedded, in addition to the existing ``?embed=true`` route-query check. That suppresses history SSE in iframes (and also the existing ``startWatchingEntryPoints`` / ``startWatchingNotifications`` calls, which were already gated on ``!embedded``), leaving the per-origin connection budget untouched at 3 used / 3 free so iframes can still make regular API calls without queueing behind SSE sockets. Verified locally: test passes in 43s with SSE flags on; previously hung past the 5-minute timeout. --- client/src/entry/analysis/App.vue | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/client/src/entry/analysis/App.vue b/client/src/entry/analysis/App.vue index cbc80f181cde..64bd2a784a02 100644 --- a/client/src/entry/analysis/App.vue +++ b/client/src/entry/analysis/App.vue @@ -48,7 +48,7 @@