Commit bb873a0

Merge branch 'main' into feature/adfs-unitest
2 parents 517f9f4 + 4dbcd03 commit bb873a0

8 files changed

Lines changed: 610 additions & 27 deletions

.secrets.baseline

Lines changed: 27 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 {
   "exclude": {
-    "files": "package-lock.json|Cargo.lock|^.secrets.baseline$|scripts/sign_image.sh|scripts/zap|sonar-project.properties",
+    "files": "package-lock.json|Cargo.lock|^.secrets.baseline$|scripts/sign_image.sh|scripts/zap|sonar-project.properties|^/Users/brian/dev/github.ibm.com/contextforge-org/sps-pipeline-config/.secrets.baseline$|^./.secrets.baseline$",
     "lines": null
   },
   "generated_at": "2026-04-01T00:41:17Z",
@@ -6843,6 +6843,32 @@
       "verified_result": null
     }
   ],
+  "plugins_rust/secrets_detection/benches/secrets_detection.rs": [
+    {
+      "hashed_secret": "86de8c52637ec530fe39b0a8471da9b8764d5242",
+      "is_secret": false,
+      "is_verified": false,
+      "line_number": 53,
+      "type": "AWS Access Key",
+      "verified_result": null
+    },
+    {
+      "hashed_secret": "5b8c02feb3811310ff452e9a812b9f98873f36bf",
+      "is_secret": false,
+      "is_verified": false,
+      "line_number": 69,
+      "type": "SoftLayer Credentials",
+      "verified_result": null
+    },
+    {
+      "hashed_secret": "0fa6996ddd42e0f9db3f1c04d06453026d56da0f",
+      "is_secret": false,
+      "is_verified": false,
+      "line_number": 969,
+      "type": "Secret Keyword",
+      "verified_result": null
+    }
+  ],
   "podman-compose-sonarqube.yaml": [
     {
       "hashed_secret": "345e9ea7c857e75dedd9edb24c232e1cab297c19",
```

mcpgateway/bootstrap_db.py

Lines changed: 31 additions & 3 deletions
```diff
@@ -36,7 +36,9 @@
 import json
 import os
 from pathlib import Path
+import random
 import tempfile
+import time
 from typing import cast

 # Third-Party
@@ -102,7 +104,7 @@ def advisory_lock(conn: Connection):
     Acquire a distributed advisory lock to serialize migrations across multiple instances.

     Behavior depends on the database backend:
-    - Postgres: Uses `pg_advisory_lock` (blocking)
+    - Postgres: Uses `pg_try_advisory_lock` (non-blocking)
     - SQLite: Fallback to local `FileLock`

     Args:
@@ -119,8 +121,34 @@ def advisory_lock(conn: Connection):
     pg_lock_id = 42424242424242

     if dialect == "postgresql":
-        logger.info("Acquiring Postgres advisory lock...")
-        conn.execute(text(f"SELECT pg_advisory_lock({pg_lock_id})"))
+        logger.info("Attempting to acquire Postgres advisory lock...")
+
+        # Retry parameters
+        max_retries = 60  # 60 attempts
+        base_delay = 1.0  # Start with 1 second
+        max_delay = 10.0  # Cap at 10 seconds
+
+        acquired = False
+        for attempt in range(max_retries):
+            # Try non-blocking lock
+            result = conn.execute(text(f"SELECT pg_try_advisory_lock({pg_lock_id})"))
+            acquired = result.scalar()
+
+            if acquired:
+                logger.info(f"Acquired Postgres advisory lock on attempt {attempt + 1}")
+                break
+
+            # Exponential backoff with jitter
+            delay = min(base_delay * (1.5**attempt), max_delay)
+            jitter = delay * random.uniform(-0.1, 0.1)  # nosec B311 # noqa: DUO102
+            sleep_time = delay + jitter
+
+            logger.info(f"Lock held by another instance, retrying in {sleep_time:.1f}s (attempt {attempt + 1}/{max_retries})")
+            time.sleep(sleep_time)
+
+        if not acquired:
+            raise TimeoutError(f"Failed to acquire advisory lock after {max_retries} attempts")
+
     try:
         yield
     finally:
```
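The retry loop above can be exercised without a live Postgres. Here is a self-contained sketch of the same try-then-back-off pattern, using `threading.Lock` as a stand-in for `pg_try_advisory_lock`; the constants mirror the diff, everything else is illustrative:

```python
# Self-contained sketch of non-blocking acquire + capped exponential backoff
# with jitter; threading.Lock stands in for pg_try_advisory_lock, so this
# runs without a database.
import random
import threading
import time

migration_lock = threading.Lock()  # stand-in for the Postgres advisory lock

def acquire_with_backoff(max_retries: int = 60, base_delay: float = 1.0, max_delay: float = 10.0) -> bool:
    for attempt in range(max_retries):
        if migration_lock.acquire(blocking=False):  # non-blocking, like pg_try_advisory_lock
            print(f"acquired on attempt {attempt + 1}")
            return True
        delay = min(base_delay * (1.5 ** attempt), max_delay)
        sleep_time = delay + delay * random.uniform(-0.1, 0.1)  # +/-10% jitter
        print(f"lock busy, retrying in {sleep_time:.1f}s ({attempt + 1}/{max_retries})")
        time.sleep(sleep_time)
    return False

if __name__ == "__main__":
    migration_lock.acquire()                              # simulate another instance holding the lock
    threading.Timer(3.0, migration_lock.release).start()  # ...which releases it after 3 seconds
    assert acquire_with_backoff()
```

The +/-10% jitter matters when many replicas boot simultaneously: without it, all waiting instances would retry on the same schedule and keep colliding.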

mcpgateway/services/gateway_service.py

Lines changed: 83 additions & 11 deletions
```diff
@@ -476,6 +476,10 @@ def __init__(self) -> None:
         self._leader_ttl = settings.redis_leader_ttl
         self._leader_heartbeat_interval = settings.redis_leader_heartbeat_interval
         self._leader_heartbeat_task: Optional[asyncio.Task] = None
+        self._follower_election_task: Optional[asyncio.Task] = None
+
+        # Log instance mapping for debugging
+        logger.info(f"Instance started: instance_id={self._instance_id}, port={settings.port}, pid={os.getpid()}")

         # Always initialize file lock as fallback (used if Redis connection fails at runtime)
         if settings.cache_type != "none":
@@ -588,8 +592,12 @@ async def initialize(self) -> None:
                 logger.info("Acquired Redis leadership. Starting health check and heartbeat tasks.")
                 self._health_check_task = asyncio.create_task(self._run_health_checks(user_email))
                 self._leader_heartbeat_task = asyncio.create_task(self._run_leader_heartbeat())
+            else:
+                # Did not acquire leadership - start follower election loop
+                logger.info("Did not acquire leadership. Starting follower election loop.")
+                self._follower_election_task = asyncio.create_task(self._run_follower_election(user_email))
         else:
-            # Always create the health check task in filelock mode; leader check is handled inside.
+            # No Redis available - always create the health check task in filelock mode
             self._health_check_task = asyncio.create_task(self._run_health_checks(user_email))

     async def shutdown(self) -> None:
@@ -608,14 +616,25 @@ async def shutdown(self) -> None:
         >>> len(service._active_gateways)
         0
         """
+        # Cancel follower election FIRST to prevent it from spawning new
+        # health-check / heartbeat tasks while we are tearing down.
+        if getattr(self, "_follower_election_task", None):
+            self._follower_election_task.cancel()
+            try:
+                await self._follower_election_task
+            except asyncio.CancelledError:
+                pass
+
+        # Now safe to cancel health-check and heartbeat (handles may have been
+        # overwritten by follower election just before cancellation — that is fine,
+        # we always cancel whichever task the attribute currently points to).
         if self._health_check_task:
             self._health_check_task.cancel()
             try:
                 await self._health_check_task
             except asyncio.CancelledError:
                 pass

-        # Cancel leader heartbeat task if running
         if getattr(self, "_leader_heartbeat_task", None):
             self._leader_heartbeat_task.cancel()
             try:
@@ -3901,34 +3920,87 @@ def get_first_gateway_by_url(self, db: Session, url: str, team_id: Optional[str]
         return self.convert_gateway_to_read(result)

     async def _run_leader_heartbeat(self) -> None:
-        """Run leader heartbeat loop to keep leader key alive.
-
-        This runs independently from health checks to ensure the leader key
-        is refreshed frequently enough (every redis_leader_heartbeat_interval seconds)
-        to prevent expiration during long-running health check operations.
+        """Run leader heartbeat loop with Redis reconnection support.

-        The loop exits if this instance loses leadership.
+        Refreshes the leader key TTL every heartbeat interval. Exits and starts
+        follower election if leadership is lost or after consecutive failures.
         """
+        consecutive_failures = 0
+        max_failures = 3
+
         while True:
             try:
                 await asyncio.sleep(self._leader_heartbeat_interval)

                 if not self._redis_client:
-                    return
+                    logger.warning("Redis client unavailable in heartbeat")
+                    consecutive_failures += 1
+                    if consecutive_failures >= max_failures:
+                        logger.error("Lost Redis connection, stopping heartbeat")
+                        return
+                    continue

                 # Check if we're still the leader
                 current_leader = await self._redis_client.get(self._leader_key)
                 if current_leader != self._instance_id:
                     logger.info("Lost Redis leadership, stopping heartbeat")
+                    self._start_follower_election()
                     return

                 # Refresh the leader key TTL
                 await self._redis_client.expire(self._leader_key, self._leader_ttl)
                 logger.debug(f"Leader heartbeat: refreshed TTL to {self._leader_ttl}s")
+                consecutive_failures = 0
+
+            except Exception as e:
+                consecutive_failures += 1
+                logger.warning(f"Leader heartbeat error (failure {consecutive_failures}/{max_failures}): {e}")
+                if consecutive_failures >= max_failures:
+                    logger.error("Too many consecutive heartbeat failures, starting follower election")
+                    self._start_follower_election()
+                    return
+
+    def _start_follower_election(self) -> None:
+        """Start a follower election task if one is not already running."""
+        if self._follower_election_task is None or self._follower_election_task.done():
+            self._follower_election_task = asyncio.create_task(self._run_follower_election(settings.platform_admin_email))
+
+    async def _run_follower_election(self, user_email: str) -> None:
+        """Continuously attempt to acquire leadership when not the leader.
+
+        This runs on follower instances and polls Redis to claim leadership
+        when the current leader key expires or becomes available.
+
+        Args:
+            user_email: Email of the user for OAuth token lookup
+        """
+        retry_interval = max(1, self._leader_ttl // 3)  # Poll at 1/3 of TTL
+
+        while True:
+            try:
+                await asyncio.sleep(retry_interval)
+
+                if not self._redis_client:
+                    logger.warning("Redis client unavailable, cannot attempt election.")
+                    continue
+
+                # Attempt to acquire leadership
+                is_leader = await self._redis_client.set(self._leader_key, self._instance_id, ex=self._leader_ttl, nx=True)
+
+                if is_leader:
+                    logger.info("Acquired Redis leadership via follower election. Starting health check and heartbeat.")
+                    # Cancel stale tasks from a previous leadership period to prevent
+                    # orphaned loops running alongside the new ones.
+                    if self._health_check_task and not self._health_check_task.done():
+                        self._health_check_task.cancel()
+                    if getattr(self, "_leader_heartbeat_task", None) and not self._leader_heartbeat_task.done():
+                        self._leader_heartbeat_task.cancel()
+                    self._health_check_task = asyncio.create_task(self._run_health_checks(user_email))
+                    self._leader_heartbeat_task = asyncio.create_task(self._run_leader_heartbeat())
+                    return  # Exit follower loop, now running as leader

             except Exception as e:
-                logger.warning(f"Leader heartbeat error: {e}")
-                # Continue trying - the main health check loop will handle leadership loss
+                logger.warning(f"Follower election error: {e}", exc_info=True)

     async def _run_health_checks(self, user_email: str) -> None:
         """Run health checks periodically,
```

tests/js/admin.test.js

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ let escapeHtml;
 beforeAll(() => {
     const win = loadAdminJs();
     escapeHtml = win.escapeHtml;
-});
+}, 5000);

 afterAll(() => {
     cleanupAdminJs();
```

tests/js/helpers/admin-env.js

Lines changed: 20 additions & 9 deletions
```diff
@@ -37,15 +37,26 @@ let instrumentedCode = null;
 export function loadAdminJs(options = {}) {
     if (!instrumentedCode) {
         const adminJsContent = fs.readFileSync(adminJsPath, "utf8");
-        const instrumenter = createInstrumenter({
-            compact: false,
-            esModules: false,
-            coverageVariable: "__coverage__",
-        });
-        instrumentedCode = instrumenter.instrumentSync(
-            adminJsContent,
-            adminJsPath,
-        );
+        // Skip instrumentation when not collecting coverage — instrumenting a
+        // ~1.3 MB file takes several seconds and is only needed for coverage reports.
+        // process.argv is unreliable in Vitest workers (spawned as separate processes),
+        // so fall back to Vitest's internal worker state.
+        const isCoverageRun =
+            process.argv.includes("--coverage") ||
+            globalThis.__vitest_worker__?.config?.coverage?.enabled === true;
+        if (isCoverageRun) {
+            const instrumenter = createInstrumenter({
+                compact: false,
+                esModules: false,
+                coverageVariable: "__coverage__",
+            });
+            instrumentedCode = instrumenter.instrumentSync(
+                adminJsContent,
+                adminJsPath,
+            );
+        } else {
+            instrumentedCode = adminJsContent;
+        }
     }

     dom = new JSDOM("<!DOCTYPE html><html><body></body></html>", {
```

tests/unit/mcpgateway/services/test_gateway_service.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -7240,6 +7240,7 @@ async def test_heartbeat_no_redis(self, gateway_service):
         gateway_service._leader_ttl = 30
         gateway_service._redis_client = None
         gateway_service._leader_heartbeat_interval = 0
+        gateway_service._follower_election_task = None
         await gateway_service._run_leader_heartbeat()

     @pytest.mark.asyncio
@@ -7251,14 +7252,17 @@ async def test_heartbeat_lost_leadership(self, gateway_service):
         gateway_service._redis_client = AsyncMock()
         gateway_service._redis_client.get = AsyncMock(return_value="other-leader")
         gateway_service._leader_heartbeat_interval = 0
-        await gateway_service._run_leader_heartbeat()
+        gateway_service._follower_election_task = None
+        with patch.object(gateway_service, "_start_follower_election"):
+            await gateway_service._run_leader_heartbeat()

     @pytest.mark.asyncio
     async def test_heartbeat_refreshes_ttl(self, gateway_service):
         """Heartbeat refreshes TTL then exits when losing leadership."""
         gateway_service._instance_id = "test-id"
         gateway_service._leader_key = "leader:health_check"
         gateway_service._leader_ttl = 30
+        gateway_service._follower_election_task = None
         call_count = 0

         async def mock_get(*args):
@@ -7272,7 +7276,8 @@ async def mock_get(*args):
         gateway_service._redis_client.get = mock_get
         gateway_service._redis_client.expire = AsyncMock()
         gateway_service._leader_heartbeat_interval = 0
-        await gateway_service._run_leader_heartbeat()
+        with patch.object(gateway_service, "_start_follower_election"):
+            await gateway_service._run_leader_heartbeat()
         gateway_service._redis_client.expire.assert_awaited_once()
```
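Following the same `AsyncMock` style, the new follower-election happy path could be covered by making `set(..., nx=True)` succeed on the first poll. A hypothetical companion test, not part of this commit; it assumes the same fixtures and imports as the tests above, and tolerates the loop's initial one-second sleep since `retry_interval` bottoms out at 1:

```python
    # Hypothetical companion test (not in this commit): the follower loop
    # claims leadership once SET NX succeeds, spawns leader tasks, and returns.
    @pytest.mark.asyncio
    async def test_follower_election_acquires_leadership(self, gateway_service):
        gateway_service._instance_id = "test-id"
        gateway_service._leader_key = "leader:health_check"
        gateway_service._leader_ttl = 3  # retry_interval = max(1, 3 // 3) = 1s
        gateway_service._health_check_task = None
        gateway_service._leader_heartbeat_task = None
        gateway_service._redis_client = AsyncMock()
        gateway_service._redis_client.set = AsyncMock(return_value=True)

        # patch.object replaces the async methods with AsyncMocks, so the
        # coroutines handed to asyncio.create_task() are real awaitables.
        with patch.object(gateway_service, "_run_health_checks"), patch.object(gateway_service, "_run_leader_heartbeat"):
            await gateway_service._run_follower_election("admin@example.com")

        gateway_service._redis_client.set.assert_awaited_once_with("leader:health_check", "test-id", ex=3, nx=True)
```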