Skip to content

Commit 18b1847

Browse files
abidlabs and claude committed
Merge main into feature/run-id-resume-main
Resolves conflicts in trackio/__init__.py and trackio/run.py, integrating the Space-backed resume lookups from #506 with the run_id-based resume logic on this branch. Space-backed init now queries /get_runs_for_project and /get_run_summary via the RemoteClient and passes the resolved run_id (when available) so step continuity works for multi-run-per-name projects.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2 parents 5552f57 + 498bbc4 commit 18b1847

5 files changed

Lines changed: 161 additions & 7 deletions

File tree

.changeset/few-bars-find.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trackio": patch
3+
---
4+
5+
feat:Scope bucket sync to trackio/ subtree to avoid walking the HF cache

tests/e2e-spaces/test_sync_and_freeze.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,10 @@ def _namespace_scoped_repo_id(test_space_id: str, repo_name: str) -> str:
5656
return repo_name
5757

5858

59+
def _repo_safe_suffix(nbytes: int = 6) -> str:
60+
return secrets.token_hex(nbytes)
61+
62+
5963
def test_sync_to_gradio_space(test_space_id, temp_dir):
6064
project_name = f"test_sync_gradio_{secrets.token_urlsafe(8)}"
6165
run_name = "run1"
@@ -90,7 +94,7 @@ def test_sync_to_gradio_space(test_space_id, temp_dir):
9094
def test_sync_to_static_space_incremental(test_space_id, temp_dir):
9195
project_name = f"test_sync_static_{secrets.token_urlsafe(8)}"
9296
run_name = "run1"
93-
suffix = secrets.token_urlsafe(6)
97+
suffix = _repo_safe_suffix()
9498
space_id = _namespace_scoped_repo_id(test_space_id, f"trackio-test-static-{suffix}")
9599
space_id, _, bucket_id = utils.preprocess_space_and_dataset_ids(space_id, None)
96100

@@ -137,7 +141,7 @@ def test_sync_gradio_then_freeze_to_static(test_space_id, temp_dir):
137141
client.predict(api_name="/force_sync")
138142
time.sleep(5)
139143

140-
suffix = secrets.token_urlsafe(6)
144+
suffix = _repo_safe_suffix()
141145
frozen_space_id = _namespace_scoped_repo_id(
142146
test_space_id, f"trackio-test-frozen-{suffix}"
143147
)

trackio/__init__.py

Lines changed: 137 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,29 @@ def _cleanup_current_run():
9797
pass
9898

9999

100-
def _safe_get_runs_for_init(project: str) -> list[str]:
100+
def _safe_get_runs_for_init(
101+
project: str,
102+
space_id: str | None,
103+
resume: str,
104+
remote_client: RemoteClient | None = None,
105+
check_existing_for_never: bool = False,
106+
) -> list[str]:
107+
if space_id is not None:
108+
if resume == "never" and not check_existing_for_never:
109+
return []
110+
try:
111+
client = remote_client or RemoteClient(
112+
space_id,
113+
hf_token=huggingface_hub.utils.get_token(),
114+
verbose=False,
115+
)
116+
runs = client.predict(project=project, api_name="/get_runs_for_project")
117+
return runs if isinstance(runs, list) else []
118+
except Exception as e:
119+
_emit_nonfatal_warning(
120+
f"trackio.init() could not inspect existing runs for project '{project}' on Space '{space_id}': {e}. Continuing without resume metadata."
121+
)
122+
return []
101123
try:
102124
return SQLiteStorage.get_runs(project)
103125
except Exception as e:
@@ -107,7 +129,32 @@ def _safe_get_runs_for_init(project: str) -> list[str]:
107129
return []
108130

109131

110-
def _safe_get_latest_run_for_init(project: str, name: str) -> dict | None:
132+
def _safe_get_latest_run_for_init(
133+
project: str,
134+
name: str,
135+
space_id: str | None = None,
136+
remote_client: RemoteClient | None = None,
137+
) -> dict | None:
138+
if space_id is not None:
139+
try:
140+
client = remote_client or RemoteClient(
141+
space_id,
142+
hf_token=huggingface_hub.utils.get_token(),
143+
verbose=False,
144+
)
145+
runs = client.predict(project=project, api_name="/get_runs_for_project")
146+
if not isinstance(runs, list):
147+
return None
148+
matches = [r for r in runs if isinstance(r, dict) and r.get("name") == name]
149+
if not matches:
150+
return None
151+
matches.sort(key=lambda r: r.get("created_at") or "", reverse=True)
152+
return matches[0]
153+
except Exception as e:
154+
_emit_nonfatal_warning(
155+
f"trackio.init() could not inspect existing runs for project '{project}' on Space '{space_id}': {e}. Continuing without resume metadata."
156+
)
157+
return None
111158
try:
112159
return SQLiteStorage.get_latest_run_record_by_name(project, name)
113160
except Exception as e:
@@ -117,6 +164,50 @@ def _safe_get_latest_run_for_init(project: str, name: str) -> dict | None:
117164
return None
118165

119166

167+
def _safe_get_last_step_for_init(
168+
project: str,
169+
run_name: str,
170+
space_id: str | None,
171+
resumed: bool,
172+
run_id: str | None = None,
173+
remote_client: RemoteClient | None = None,
174+
) -> int | None:
175+
if not resumed:
176+
return None
177+
if space_id is not None:
178+
try:
179+
client = remote_client or RemoteClient(
180+
space_id,
181+
hf_token=huggingface_hub.utils.get_token(),
182+
verbose=False,
183+
)
184+
summary_kwargs: dict[str, Any] = {
185+
"project": project,
186+
"api_name": "/get_run_summary",
187+
}
188+
if run_id is not None:
189+
summary_kwargs["run_id"] = run_id
190+
else:
191+
summary_kwargs["run"] = run_name
192+
summary = client.predict(**summary_kwargs)
193+
if isinstance(summary, dict):
194+
last_step = summary.get("last_step")
195+
return last_step if isinstance(last_step, int) else None
196+
return None
197+
except Exception as e:
198+
_emit_nonfatal_warning(
199+
f"trackio.init() could not recover the previous step for run '{run_name}' on Space '{space_id}': {e}. Continuing from step 0."
200+
)
201+
return None
202+
try:
203+
return SQLiteStorage.get_max_step_for_run(project, run_name, run_id=run_id)
204+
except Exception as e:
205+
_emit_nonfatal_warning(
206+
f"trackio.init() could not recover the previous step for run '{run_name}': {e}. Continuing from step 0."
207+
)
208+
return None
209+
210+
120211
def init(
121212
project: str,
122213
name: str | None = None,
@@ -298,8 +389,36 @@ def init(
298389
)
299390
context_vars.current_project.set(project)
300391

392+
remote_client = None
393+
if space_id is not None:
394+
try:
395+
remote_client = RemoteClient(
396+
space_id,
397+
hf_token=huggingface_hub.utils.get_token(),
398+
verbose=False,
399+
)
400+
except Exception as e:
401+
_emit_nonfatal_warning(
402+
f"trackio.init() could not create a Space client for '{space_id}': {e}. Continuing with local fallback metadata lookups."
403+
)
404+
405+
existing_run_records = _safe_get_runs_for_init(
406+
project,
407+
space_id,
408+
resume,
409+
remote_client=remote_client,
410+
check_existing_for_never=name is not None,
411+
)
412+
existing_runs = [
413+
r["name"] if isinstance(r, dict) else r for r in existing_run_records
414+
]
415+
301416
existing_run = (
302-
_safe_get_latest_run_for_init(project, name) if name is not None else None
417+
_safe_get_latest_run_for_init(
418+
project, name, space_id=space_id, remote_client=remote_client
419+
)
420+
if name is not None
421+
else None
303422
)
304423
resolved_run_id = None
305424

@@ -319,6 +438,19 @@ def init(
319438
else:
320439
raise ValueError("resume must be one of: 'must', 'allow', or 'never'")
321440

441+
initial_last_step = (
442+
_safe_get_last_step_for_init(
443+
project,
444+
name,
445+
space_id,
446+
resumed,
447+
run_id=resolved_run_id,
448+
remote_client=remote_client,
449+
)
450+
if name is not None
451+
else None
452+
)
453+
322454
if auto_log_gpu is None:
323455
nvidia_available = gpu_available()
324456
apple_available = apple_gpu_available()
@@ -342,6 +474,8 @@ def init(
342474
group=group,
343475
config=config,
344476
space_id=space_id,
477+
existing_runs=existing_runs,
478+
initial_last_step=initial_last_step,
345479
auto_log_gpu=auto_log_gpu,
346480
gpu_log_interval=gpu_log_interval,
347481
webhook_url=webhook_url,

trackio/bucket_storage.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,8 @@ def _list_bucket_file_paths(bucket_id: str, prefix: str | None = None) -> list[s
2525
def download_bucket_to_trackio_dir(bucket_id: str) -> None:
2626
TRACKIO_DIR.mkdir(parents=True, exist_ok=True)
2727
sync_bucket(
28-
source=f"hf://buckets/{bucket_id}",
29-
dest=str(TRACKIO_DIR.parent),
28+
source=f"hf://buckets/{bucket_id}/trackio",
29+
dest=str(TRACKIO_DIR),
3030
quiet=True,
3131
)
3232

trackio/run.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,8 @@ def __init__(
4343
group: str | None = None,
4444
config: dict | None = None,
4545
space_id: str | None = None,
46+
existing_runs: list[str] | None = None,
47+
initial_last_step: int | None = None,
4648
auto_log_gpu: bool = False,
4749
gpu_log_interval: float = 10.0,
4850
webhook_url: str | None = None,
@@ -66,6 +68,9 @@ def __init__(
6668
Keys starting with '_' are reserved for internal use.
6769
space_id: The HF Space ID if logging to a Space (e.g., "user/space").
6870
If provided, media files will be uploaded to the Space.
71+
existing_runs: Optional pre-fetched run names for this project. Used to
72+
avoid redundant storage or remote lookups during init.
73+
initial_last_step: Optional pre-fetched last step for a resumed run.
6974
auto_log_gpu: Whether to automatically log GPU metrics (utilization,
7075
memory, temperature) at regular intervals.
7176
gpu_log_interval: The interval in seconds between GPU metric logs.
@@ -88,6 +93,8 @@ def __init__(
8893
self._client = client
8994
self._space_id = space_id
9095
self.id = run_id or uuid.uuid4().hex
96+
self._existing_runs = existing_runs
97+
self._initial_last_step = initial_last_step
9198
if name is not None:
9299
self.name = name
93100
else:
@@ -182,6 +189,8 @@ def _warn_once(self, key: str, message: str) -> None:
182189
_emit_nonfatal_warning(message)
183190

184191
def _safe_get_existing_runs(self) -> list[str]:
192+
if self._existing_runs is not None:
193+
return self._existing_runs
185194
try:
186195
return SQLiteStorage.get_runs(self.project)
187196
except Exception as e:
@@ -192,6 +201,8 @@ def _safe_get_existing_runs(self) -> list[str]:
192201
return []
193202

194203
def _safe_get_max_step_for_run(self) -> int | None:
204+
if self._initial_last_step is not None:
205+
return self._initial_last_step
195206
try:
196207
return SQLiteStorage.get_max_step_for_run(
197208
self.project, self.name, run_id=self.id

0 commit comments

Comments
 (0)