Skip to content

Commit 498bbc4

Browse files
abidlabs, claude, and gradio-pr-bot
authored
Scope bucket sync to trackio/ subtree to avoid walking the HF cache (#506)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
1 parent 0c06dbf commit 498bbc4

4 files changed

Lines changed: 112 additions & 4 deletions

File tree

.changeset/few-bars-find.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trackio": patch
3+
---
4+
5+
feat:Scope bucket sync to trackio/ subtree to avoid walking the HF cache

trackio/__init__.py

Lines changed: 94 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,29 @@ def _cleanup_current_run():
9797
pass
9898

9999

100-
def _safe_get_runs_for_init(project: str) -> list[str]:
100+
def _safe_get_runs_for_init(
101+
project: str,
102+
space_id: str | None,
103+
resume: str,
104+
remote_client: RemoteClient | None = None,
105+
check_existing_for_never: bool = False,
106+
) -> list[str]:
107+
if space_id is not None:
108+
if resume == "never" and not check_existing_for_never:
109+
return []
110+
try:
111+
client = remote_client or RemoteClient(
112+
space_id,
113+
hf_token=huggingface_hub.utils.get_token(),
114+
verbose=False,
115+
)
116+
runs = client.predict(project=project, api_name="/get_runs_for_project")
117+
return runs if isinstance(runs, list) else []
118+
except Exception as e:
119+
_emit_nonfatal_warning(
120+
f"trackio.init() could not inspect existing runs for project '{project}' on Space '{space_id}': {e}. Continuing without resume metadata."
121+
)
122+
return []
101123
try:
102124
return SQLiteStorage.get_runs(project)
103125
except Exception as e:
@@ -107,6 +129,43 @@ def _safe_get_runs_for_init(project: str) -> list[str]:
107129
return []
108130

109131

132+
def _safe_get_last_step_for_init(
133+
project: str,
134+
run_name: str,
135+
space_id: str | None,
136+
resumed: bool,
137+
remote_client: RemoteClient | None = None,
138+
) -> int | None:
139+
if not resumed:
140+
return None
141+
if space_id is not None:
142+
try:
143+
client = remote_client or RemoteClient(
144+
space_id,
145+
hf_token=huggingface_hub.utils.get_token(),
146+
verbose=False,
147+
)
148+
summary = client.predict(
149+
project=project, run=run_name, api_name="/get_run_summary"
150+
)
151+
if isinstance(summary, dict):
152+
last_step = summary.get("last_step")
153+
return last_step if isinstance(last_step, int) else None
154+
return None
155+
except Exception as e:
156+
_emit_nonfatal_warning(
157+
f"trackio.init() could not recover the previous step for run '{run_name}' on Space '{space_id}': {e}. Continuing from step 0."
158+
)
159+
return None
160+
try:
161+
return SQLiteStorage.get_max_step_for_run(project, run_name)
162+
except Exception as e:
163+
_emit_nonfatal_warning(
164+
f"trackio.init() could not recover the previous step for run '{run_name}': {e}. Continuing from step 0."
165+
)
166+
return None
167+
168+
110169
def init(
111170
project: str,
112171
name: str | None = None,
@@ -288,7 +347,26 @@ def init(
288347
)
289348
context_vars.current_project.set(project)
290349

291-
existing_runs = _safe_get_runs_for_init(project)
350+
remote_client = None
351+
if space_id is not None:
352+
try:
353+
remote_client = RemoteClient(
354+
space_id,
355+
hf_token=huggingface_hub.utils.get_token(),
356+
verbose=False,
357+
)
358+
except Exception as e:
359+
_emit_nonfatal_warning(
360+
f"trackio.init() could not create a Space client for '{space_id}': {e}. Continuing with local fallback metadata lookups."
361+
)
362+
363+
existing_runs = _safe_get_runs_for_init(
364+
project,
365+
space_id,
366+
resume,
367+
remote_client=remote_client,
368+
check_existing_for_never=name is not None,
369+
)
292370

293371
if resume == "must":
294372
if name is None:
@@ -310,6 +388,18 @@ def init(
310388
else:
311389
raise ValueError("resume must be one of: 'must', 'allow', or 'never'")
312390

391+
initial_last_step = (
392+
_safe_get_last_step_for_init(
393+
project,
394+
name,
395+
space_id,
396+
resumed,
397+
remote_client=remote_client,
398+
)
399+
if name is not None
400+
else None
401+
)
402+
313403
if auto_log_gpu is None:
314404
nvidia_available = gpu_available()
315405
apple_available = apple_gpu_available()
@@ -332,6 +422,8 @@ def init(
332422
group=group,
333423
config=config,
334424
space_id=space_id,
425+
existing_runs=existing_runs,
426+
initial_last_step=initial_last_step,
335427
auto_log_gpu=auto_log_gpu,
336428
gpu_log_interval=gpu_log_interval,
337429
webhook_url=webhook_url,

trackio/bucket_storage.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,8 @@ def _list_bucket_file_paths(bucket_id: str, prefix: str | None = None) -> list[s
2525
def download_bucket_to_trackio_dir(bucket_id: str) -> None:
2626
TRACKIO_DIR.mkdir(parents=True, exist_ok=True)
2727
sync_bucket(
28-
source=f"hf://buckets/{bucket_id}",
29-
dest=str(TRACKIO_DIR.parent),
28+
source=f"hf://buckets/{bucket_id}/trackio",
29+
dest=str(TRACKIO_DIR),
3030
quiet=True,
3131
)
3232

trackio/run.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,8 @@ def __init__(
4242
group: str | None = None,
4343
config: dict | None = None,
4444
space_id: str | None = None,
45+
existing_runs: list[str] | None = None,
46+
initial_last_step: int | None = None,
4547
auto_log_gpu: bool = False,
4648
gpu_log_interval: float = 10.0,
4749
webhook_url: str | None = None,
@@ -65,6 +67,9 @@ def __init__(
6567
Keys starting with '_' are reserved for internal use.
6668
space_id: The HF Space ID if logging to a Space (e.g., "user/space").
6769
If provided, media files will be uploaded to the Space.
70+
existing_runs: Optional pre-fetched run names for this project. Used to
71+
avoid redundant storage or remote lookups during init.
72+
initial_last_step: Optional pre-fetched last step for a resumed run.
6873
auto_log_gpu: Whether to automatically log GPU metrics (utilization,
6974
memory, temperature) at regular intervals.
7075
gpu_log_interval: The interval in seconds between GPU metric logs.
@@ -86,6 +91,8 @@ def __init__(
8691
self._client_thread = None
8792
self._client = client
8893
self._space_id = space_id
94+
self._existing_runs = existing_runs
95+
self._initial_last_step = initial_last_step
8996
if name is not None:
9097
self.name = name
9198
else:
@@ -180,6 +187,8 @@ def _warn_once(self, key: str, message: str) -> None:
180187
_emit_nonfatal_warning(message)
181188

182189
def _safe_get_existing_runs(self) -> list[str]:
190+
if self._existing_runs is not None:
191+
return self._existing_runs
183192
try:
184193
return SQLiteStorage.get_runs(self.project)
185194
except Exception as e:
@@ -190,6 +199,8 @@ def _safe_get_existing_runs(self) -> list[str]:
190199
return []
191200

192201
def _safe_get_max_step_for_run(self) -> int | None:
202+
if self._initial_last_step is not None:
203+
return self._initial_last_step
193204
try:
194205
return SQLiteStorage.get_max_step_for_run(self.project, self.name)
195206
except Exception as e:

0 commit comments

Comments
 (0)