Skip to content

Commit 8719bfd

Browse files
committed
Per-project conda env + multi-tenant isolation hardening
Each new project now provisions its own conda env at `<project_dir>/.env` by cloning the configured base env (default: `ark`). Plus hardening to ensure projects can't read each other's keys, packages, or state.

## Per-project conda env

* New status `initializing` while the env clones in the background. The webapp returns immediately; orchestrator launch waits for the env to be ready.
* Status surfaced in the dashboard (badge, status line, all 3 i18n locales) and in Project Info ("Conda Env: `<prefix path>`").
* Background task sends Telegram updates: starting, env ready, failure.
* `provision_project_env` clones via `conda create --prefix --clone`, logs to `<pdir>/logs/env_provision.log`, and is idempotent on retry.
* `launch_local_job` and the slurm template prefer `<pdir>/.env` via `--prefix`; legacy projects fall back to the shared env.
* `LocalBackend.conda_env` / `SlurmBackend.conda_env` point the agent prompt at the project-local env when present.
* Delete works unchanged: `shutil.rmtree(pdir)` removes the env too.

## Robust binary discovery (systemd PATH safe)

Add `find_conda_binary()` and `find_claude_binary()` that fall back to `$CONDA_EXE`, common miniforge/anaconda paths, and `~/.nvm/.../bin` when `shutil.which()` fails under systemd's bare PATH. Add `build_subprocess_path()` to enrich the orchestrator subprocess `PATH` with the claude bin dir, `~/.local/bin`, and texlive 2025.

## Multi-tenant isolation

* `PYTHONNOUSERSITE=1` for orchestrator + verify subprocesses so no project can read packages from `/home/<user>/.local` cross-tenant.
* `ARK_NO_GLOBAL_CONFIG=1` env var (read by `deep_research.get_gemini_api_key` and `telegram.TelegramConfig._load_global`) blocks fallback to lab-wide configs when running under the webapp; each project must use only the keys passed in by its owning user.
* `verify_claude_cli` runs with an isolated HOME, full PATH enrichment, and resolves the absolute claude binary path so settings-page key verification works under systemd.
## Escape hatch + config

* `ARK_FORCE_LOCAL=1` makes `slurm_available()` return False; used while the SLURM controller spool is full.
* New setting `PROJECT_BASE_CONDA_ENV` (default `ark`) controls which env new projects clone from. Empty disables provisioning entirely.
1 parent a486e9c commit 8719bfd

File tree

10 files changed

+423
-63
lines changed

10 files changed

+423
-63
lines changed

ark/compute.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,14 @@ def job_prefix(self) -> str:
9999

100100
@property
101101
def conda_env(self) -> str:
102-
return (self._compute_config.get("conda_env")
103-
or self.config.get("conda_env")
104-
or self.project_name)
102+
explicit = (self._compute_config.get("conda_env")
103+
or self.config.get("conda_env"))
104+
if explicit:
105+
return explicit
106+
local_env = self.code_dir / ".env"
107+
if (local_env / "conda-meta").is_dir():
108+
return str(local_env)
109+
return self.project_name
105110

106111
@property
107112
def slurm_template(self) -> str:
@@ -202,9 +207,15 @@ class LocalBackend(ComputeBackend):
202207

203208
@property
204209
def conda_env(self) -> str:
205-
return (self._compute_config.get("conda_env")
206-
or self.config.get("conda_env")
207-
or self.project_name)
210+
explicit = (self._compute_config.get("conda_env")
211+
or self.config.get("conda_env"))
212+
if explicit:
213+
return explicit
214+
# Per-project env created by the webapp at <code_dir>/.env.
215+
local_env = self.code_dir / ".env"
216+
if (local_env / "conda-meta").is_dir():
217+
return str(local_env)
218+
return self.project_name
208219

209220
@property
210221
def gpu_count(self) -> int:

ark/deep_research.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,25 @@ def _global_config() -> Path:
2222

2323

2424
def get_gemini_api_key() -> str:
25-
"""Get Gemini API key from env var or global config."""
25+
"""
26+
Get Gemini API key from env var or global config.
27+
28+
When ``ARK_NO_GLOBAL_CONFIG=1`` is set (which the webapp does for every
29+
orchestrator subprocess), the global ``.ark/config.yaml`` fallback is
30+
skipped — only env vars are honored. This prevents one webapp user's
31+
project from silently using another user's (or the lab admin's)
32+
Gemini key when they haven't configured their own.
33+
"""
2634
# 1. Environment variable
2735
key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
2836
if key:
2937
return key
3038

31-
# 2. Global config
39+
# 2. Global config (skipped under webapp / multi-user mode)
40+
no_global = os.environ.get("ARK_NO_GLOBAL_CONFIG", "").strip().lower()
41+
if no_global and no_global not in ("0", "false", "no", "off"):
42+
return ""
43+
3244
if _global_config().exists():
3345
try:
3446
with open(_global_config()) as f:

ark/telegram.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,13 @@ def __init__(self, project_config: dict = None):
4444

4545
def _load_global(self) -> dict:
4646
if self._global is None:
47+
# In webapp / multi-user mode, never read the lab-wide telegram
48+
# config: each project must use its own bot/chat from the
49+
# project record.
50+
no_global = os.environ.get("ARK_NO_GLOBAL_CONFIG", "").strip().lower()
51+
if no_global and no_global not in ("0", "false", "no", "off"):
52+
self._global = {}
53+
return self._global
4754
if self._global_config_path().exists():
4855
try:
4956
with open(self._global_config_path()) as f:

ark/webapp/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ def _env_file() -> Path:
2828
"SLURM_PARTITION": "",
2929
"SLURM_ACCOUNT": "",
3030
"SLURM_CONDA_ENV": "ark",
31+
# Base conda env that each new project's per-project env is cloned from.
32+
# Defaults to "ark" because that's the env that historically ran the
33+
# orchestrator and has all the heavy deps (google-genai, numpy, pandas,
34+
# matplotlib, weasyprint, pytest, …) that ark-dev/ark-prod intentionally
35+
# omit. Set to empty to disable per-project env provisioning entirely.
36+
"PROJECT_BASE_CONDA_ENV": "ark",
3137
"GOOGLE_CLIENT_ID": "",
3238
"GOOGLE_CLIENT_SECRET": "",
3339
}
@@ -121,6 +127,7 @@ def __init__(self):
121127
self.slurm_partition: str = merged.get("SLURM_PARTITION", "")
122128
self.slurm_account: str = merged.get("SLURM_ACCOUNT", "")
123129
self.slurm_conda_env: str = merged.get("SLURM_CONDA_ENV", "ark")
130+
self.project_base_conda_env: str = merged.get("PROJECT_BASE_CONDA_ENV", "ark")
124131
self.slurm_gres: str = merged.get("SLURM_GRES", "")
125132
self.slurm_cpus_per_task: int = int(merged.get("SLURM_CPUS_PER_TASK", "4"))
126133
raw_domains = merged.get("EMAIL_DOMAINS", "")

ark/webapp/db.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def get_all_projects(session: Session) -> list[Project]:
140140

141141
def get_running_projects(session: Session) -> list[Project]:
142142
return list(session.exec(
143-
select(Project).where(Project.status.in_(["queued", "running", "pending"]))
143+
select(Project).where(Project.status.in_(["queued", "running", "pending", "initializing"]))
144144
).all())
145145

146146

ark/webapp/jobs.py

Lines changed: 176 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,162 @@
1616

1717
_SLURM_TEMPLATE = Path(__file__).parent / "slurm_template.sh"
1818

19+
# Per-project conda env: lives at <project_dir>/.env as a `--prefix` env cloned
20+
# from the configured base env. Detected via the conda-meta directory which
21+
# every conda env has, even an empty one.
22+
PROJECT_ENV_DIRNAME = ".env"
23+
24+
25+
def project_env_prefix(project_dir: Path) -> Path:
26+
return Path(project_dir) / PROJECT_ENV_DIRNAME
27+
28+
29+
def project_env_ready(project_dir: Path) -> bool:
30+
return (project_env_prefix(project_dir) / "conda-meta").is_dir()
31+
32+
33+
def find_claude_binary() -> str | None:
34+
"""
35+
Locate the ``claude`` CLI even when systemd's bare PATH doesn't include
36+
the user's nvm/npm bin dir. Tries shutil.which, $HOME/.local/bin, then
37+
every nvm node version's bin dir, then a few common npm prefixes.
38+
"""
39+
found = shutil.which("claude")
40+
if found:
41+
return found
42+
home = Path(os.path.expanduser("~"))
43+
candidates: list[Path] = [home / ".local" / "bin" / "claude"]
44+
nvm_dir = home / ".nvm" / "versions" / "node"
45+
if nvm_dir.is_dir():
46+
# Newest version first so we pick the actively used one.
47+
try:
48+
for v in sorted(nvm_dir.iterdir(), reverse=True):
49+
candidates.append(v / "bin" / "claude")
50+
except OSError:
51+
pass
52+
candidates += [
53+
home / ".npm-global" / "bin" / "claude",
54+
Path("/usr/local/bin/claude"),
55+
]
56+
for c in candidates:
57+
if c.is_file() and os.access(c, os.X_OK):
58+
return str(c)
59+
return None
60+
61+
62+
def build_subprocess_path(extra: list[str] | None = None) -> str:
63+
"""
64+
Build a PATH string suitable for spawning ARK subprocesses (orchestrator,
65+
claude CLI, etc.) when the parent process has a bare systemd PATH.
66+
Prepends: claude binary dir, ~/.local/bin, texlive 2025 bin, plus any
67+
caller-supplied dirs, then the existing PATH.
68+
"""
69+
parts: list[str] = list(extra or [])
70+
home = Path(os.path.expanduser("~"))
71+
72+
claude = find_claude_binary()
73+
if claude:
74+
parts.append(str(Path(claude).parent))
75+
76+
parts.append(str(home / ".local" / "bin"))
77+
78+
texlive = home / "texlive" / "2025" / "bin" / "x86_64-linux"
79+
if texlive.is_dir():
80+
parts.append(str(texlive))
81+
82+
existing = os.environ.get("PATH", "/usr/bin:/bin")
83+
# Dedupe while preserving order.
84+
seen: set[str] = set()
85+
out: list[str] = []
86+
for p in parts + existing.split(":"):
87+
if p and p not in seen:
88+
seen.add(p)
89+
out.append(p)
90+
return ":".join(out)
91+
92+
93+
def find_conda_binary() -> str | None:
94+
"""
95+
Locate the conda binary even when PATH is bare (e.g. systemd unit with no
96+
Environment=PATH=). Tries shutil.which, then $CONDA_EXE, then common
97+
install prefixes under $HOME.
98+
"""
99+
found = shutil.which("conda")
100+
if found:
101+
return found
102+
env_var = os.environ.get("CONDA_EXE")
103+
if env_var and Path(env_var).is_file():
104+
return env_var
105+
home = Path(os.path.expanduser("~"))
106+
for candidate in (
107+
home / "miniforge3" / "condabin" / "conda",
108+
home / "miniforge3" / "bin" / "conda",
109+
home / "miniconda3" / "condabin" / "conda",
110+
home / "miniconda3" / "bin" / "conda",
111+
home / "anaconda3" / "condabin" / "conda",
112+
home / "anaconda3" / "bin" / "conda",
113+
Path("/opt/conda/bin/conda"),
114+
):
115+
if candidate.is_file():
116+
return str(candidate)
117+
return None
118+
119+
120+
def provision_project_env(project_dir: Path, base_env: str = "ark-dev",
121+
log_path: Path | None = None) -> tuple[bool, str]:
122+
"""
123+
Create a per-project conda env at <project_dir>/.env by cloning ``base_env``.
124+
125+
Returns ``(success, message)``. Idempotent: returns success immediately if
126+
the env is already present. Writes the conda command output to ``log_path``
127+
(or <project_dir>/.env_provision.log) for debugging.
128+
"""
129+
project_dir = Path(project_dir)
130+
target = project_env_prefix(project_dir)
131+
log_path = Path(log_path) if log_path else (project_dir / ".env_provision.log")
132+
log_path.parent.mkdir(parents=True, exist_ok=True)
133+
134+
if project_env_ready(project_dir):
135+
return True, f"already exists at {target}"
136+
137+
conda_bin = find_conda_binary()
138+
if not conda_bin:
139+
msg = ("conda binary not found (checked PATH, $CONDA_EXE, and common "
140+
"miniforge/anaconda locations); cannot provision project env")
141+
log_path.write_text(msg + "\n")
142+
return False, msg
143+
144+
# Stale partial env from a prior failed clone — wipe before retrying.
145+
if target.exists():
146+
shutil.rmtree(target, ignore_errors=True)
147+
148+
cmd = [conda_bin, "create", "--prefix", str(target),
149+
"--clone", base_env, "--yes"]
150+
started = time.time()
151+
try:
152+
with open(log_path, "w") as lf:
153+
lf.write(f"$ {' '.join(cmd)}\n")
154+
lf.flush()
155+
proc = subprocess.run(cmd, stdout=lf, stderr=subprocess.STDOUT)
156+
elapsed = time.time() - started
157+
if proc.returncode == 0 and project_env_ready(project_dir):
158+
return True, f"cloned {base_env} in {elapsed:.1f}s"
159+
return False, f"conda create failed (rc={proc.returncode}); see {log_path}"
160+
except Exception as e:
161+
return False, f"conda create raised {type(e).__name__}: {e}"
162+
19163

20164
def _run(cmd: list[str], **kwargs) -> subprocess.CompletedProcess:
21165
return subprocess.run(cmd, capture_output=True, text=True, **kwargs)
22166

23167

24168
def slurm_available() -> bool:
169+
# Escape hatch: set ARK_FORCE_LOCAL=1 to bypass SLURM submission and run
170+
# everything as local subprocesses (useful when slurmctld spool is full or
171+
# the cluster is unavailable). Falsy values ("", "0", "false") are ignored.
172+
force_local = os.environ.get("ARK_FORCE_LOCAL", "").strip().lower()
173+
if force_local and force_local not in ("0", "false", "no", "off"):
174+
return False
25175
return shutil.which("sbatch") is not None
26176

27177

@@ -202,11 +352,18 @@ def launch_local_job(
202352
exit_file = log_dir / "local_exit.txt"
203353
exit_file.unlink(missing_ok=True)
204354

205-
# Build the orchestrator command, preferring the configured conda env.
206-
conda_env = getattr(settings, "slurm_conda_env", "") or ""
207-
conda_bin = shutil.which("conda") if conda_env else None
208-
if conda_bin and conda_env:
209-
python_prefix = [conda_bin, "run", "--no-capture-output", "-n", conda_env, "python"]
355+
# Build the orchestrator command, preferring the project-local conda env
356+
# at <project_dir>/.env. Falls back to the named env from settings, then
357+
# the webapp's own interpreter.
358+
conda_bin = find_conda_binary()
359+
local_env = project_env_prefix(project_dir)
360+
fallback_env = getattr(settings, "slurm_conda_env", "") or ""
361+
if conda_bin and project_env_ready(project_dir):
362+
python_prefix = [conda_bin, "run", "--no-capture-output",
363+
"--prefix", str(local_env), "python"]
364+
elif conda_bin and fallback_env:
365+
python_prefix = [conda_bin, "run", "--no-capture-output",
366+
"-n", fallback_env, "python"]
210367
else:
211368
python_prefix = [sys.executable]
212369

@@ -241,6 +398,20 @@ def launch_local_job(
241398

242399
env["HOME"] = str(project_dir)
243400
env["XDG_CONFIG_HOME"] = str(project_dir / ".config")
401+
# Disable Python's pip user-site discovery so projects are completely
402+
# isolated — no project can read packages from /home/xinj/.local/... or
403+
# any other user's user-site. The cloned per-project conda env is the
404+
# ONLY source of Python packages for the orchestrator.
405+
env["PYTHONNOUSERSITE"] = "1"
406+
# Tell the orchestrator (and any ark.* code it loads) to NEVER fall back
407+
# to lab-wide configs like /home/xinj/ARK/.ark/config.yaml. Each project
408+
# must use only the api keys / oauth tokens that the webapp passed in
409+
# via env vars from the project's owning user.
410+
env["ARK_NO_GLOBAL_CONFIG"] = "1"
411+
# Make sure the orchestrator's PATH can find the claude CLI (lives in
412+
# ~/.nvm/.../bin), latexmk (~/.local/bin), and pdflatex (~/texlive/2025/...).
413+
# systemd's bare PATH doesn't include any of these.
414+
env["PATH"] = build_subprocess_path()
244415

245416
with open(log_file, "w") as lf:
246417
proc = subprocess.Popen(

0 commit comments

Comments
 (0)