Skip to content

Commit af23d74

Browse files
abidlabs, claude, and gradio-pr-bot
authored
Prevent trackio errors from crashing the user's training loop (#496)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
1 parent 46a3cc3 commit af23d74

7 files changed

Lines changed: 759 additions & 426 deletions

File tree

.changeset/upset-news-end.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trackio": minor
3+
---
4+
5+
feat:Prevent trackio errors from crashing the user's training loop

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,8 @@ To get started and see basic examples of usage, see these files:
245245

246246
`trackio.log()` is a non-blocking call that appends to an in-memory queue and returns immediately. A background thread drains the queue every **0.5 s** and writes to the local SQLite database. Because log calls never touch the network or disk on the calling thread, the client-side throughput is effectively **unlimited** -- you can burst thousands of calls per second without slowing down your training loop.
247247

248+
Trackio is written defensively so Trackio-side failures should never take down your main experiment code. Under normal usage, issues inside Trackio's logging, flushing, or delivery paths degrade to warnings and local buffering rather than exceptions from your training loop.
249+
248250
### Logging to a Hugging Face Space
249251

250252
When a `space_id` is provided, the same background thread batches queued entries and pushes them to the Space via the Gradio client API. The main factors that affect end-to-end throughput are:

docs/source/track.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ Once your run is initialized, you can start logging data using the [`log`] funct
5252
trackio.log({"loss": 0.05})
5353
```
5454

55+
Trackio is written defensively so Trackio-side failures should never take down your main experiment code. Under normal usage, issues inside Trackio's logging, flushing, or delivery paths degrade to warnings and local buffering rather than exceptions from your training loop.
56+
5557
Each call to [`log`] automatically increments the step counter.
5658
If you want to log multiple metrics at once, pass them together:
5759

@@ -336,4 +338,4 @@ for batch_size in [16, 32, 64]:
336338
trackio.finish()
337339
```
338340

339-
In the dashboard, you can then group by "learning_rate" or "batch_size" to more easily compare runs with different hyperparameters.
341+
In the dashboard, you can then group by "learning_rate" or "batch_size" to more easily compare runs with different hyperparameters.

tests/unit/test_run.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import sqlite3
12
import time
23
from unittest.mock import MagicMock
34

45
import pytest
56

6-
from trackio import Markdown, Run, init
7+
from trackio import Markdown, Run, init, utils
78
from trackio.sqlite_storage import SQLiteStorage
89

910

@@ -158,3 +159,61 @@ def test_run_group_added(temp_dir):
158159
config={"learning_rate": 0.01},
159160
)
160161
assert run.config["_Group"] == "test_group"
162+
163+
164+
def test_log_does_not_crash_on_bad_metrics(temp_dir, monkeypatch):
165+
run = Run(url=None, project="proj", client=None, name="safe-run", space_id=None)
166+
167+
original = utils.serialize_values
168+
169+
def exploding_serialize(metrics):
170+
if "bad" in metrics:
171+
raise RuntimeError("serialize boom")
172+
return original(metrics)
173+
174+
monkeypatch.setattr(utils, "serialize_values", exploding_serialize)
175+
176+
with pytest.warns(UserWarning, match="trackio.log\\(\\) failed to process metrics"):
177+
run.log({"bad": 1})
178+
179+
run.log({"loss": 0.5})
180+
run.finish()
181+
182+
logs = SQLiteStorage.get_logs("proj", "safe-run")
183+
assert len(logs) == 1
184+
assert logs[0]["loss"] == 0.5
185+
186+
187+
def test_init_survives_storage_read_failures(temp_dir, monkeypatch):
188+
def raise_db_error(*args, **kwargs):
189+
raise sqlite3.DatabaseError("database disk image is malformed")
190+
191+
monkeypatch.setattr(SQLiteStorage, "get_runs", raise_db_error)
192+
monkeypatch.setattr(SQLiteStorage, "get_max_step_for_run", raise_db_error)
193+
194+
with pytest.warns(UserWarning) as record:
195+
run = init(project="broken-project", name="safe-run")
196+
197+
messages = [str(item.message) for item in record]
198+
assert any("could not inspect existing runs" in message for message in messages)
199+
assert any("could not recover the previous step" in message for message in messages)
200+
assert isinstance(run, Run)
201+
assert run.name == "safe-run"
202+
assert run._next_step == 0
203+
204+
run.log({"loss": 0.5})
205+
run.finish()
206+
207+
208+
def test_local_flush_failure_does_not_crash(temp_dir, monkeypatch):
209+
run = Run(url=None, project="proj", client=None, name="safe-run", space_id=None)
210+
211+
def raise_db_error(*args, **kwargs):
212+
raise sqlite3.DatabaseError("database disk image is malformed")
213+
214+
monkeypatch.setattr(SQLiteStorage, "bulk_log", raise_db_error)
215+
216+
run.log({"loss": 0.5})
217+
218+
with pytest.warns(UserWarning, match="trackio failed to flush metric logs"):
219+
run.finish()

trackio/__init__.py

Lines changed: 58 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
from trackio.sqlite_storage import SQLiteStorage
4040
from trackio.table import Table
4141
from trackio.typehints import UploadEntry
42-
from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR
42+
from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR, _emit_nonfatal_warning
4343

4444
logging.getLogger("httpx").setLevel(logging.WARNING)
4545

@@ -98,6 +98,16 @@ def _cleanup_current_run():
9898
pass
9999

100100

101+
def _safe_get_runs_for_init(project: str) -> list[str]:
102+
try:
103+
return SQLiteStorage.get_runs(project)
104+
except Exception as e:
105+
_emit_nonfatal_warning(
106+
f"trackio.init() could not inspect existing runs for project '{project}': {e}. Continuing without resume metadata."
107+
)
108+
return []
109+
110+
101111
def init(
102112
project: str,
103113
name: str | None = None,
@@ -195,7 +205,7 @@ def init(
195205
`Run`: A [`Run`] object that can be used to log metrics and finish the run.
196206
"""
197207
if settings is not None:
198-
warnings.warn(
208+
_emit_nonfatal_warning(
199209
"* Warning: settings is not used. Provided for compatibility with wandb.init(). Please create an issue at: https://github.com/gradio-app/trackio/issues if you need a specific feature implemented."
200210
)
201211

@@ -215,7 +225,7 @@ def init(
215225
) from e
216226

217227
if space_id is None and bucket_id is not None:
218-
warnings.warn(
228+
_emit_nonfatal_warning(
219229
"trackio.init() has `bucket_id` set but `space_id` is None: metrics will be logged "
220230
"locally only. Pass `space_id` to create or use a Hugging Face Space, which will be "
221231
"attached to the Hugging Face Bucket.",
@@ -259,32 +269,39 @@ def init(
259269
if not _should_embed_local:
260270
utils.print_dashboard_instructions(project)
261271
else:
262-
deploy.create_space_if_not_exists(
263-
space_id,
264-
space_storage,
265-
dataset_id,
266-
bucket_id,
267-
private,
268-
)
269-
user_name, space_name = space_id.split("/")
270-
space_url = deploy.SPACE_HOST_URL.format(
271-
user_name=user_name, space_name=space_name
272-
)
273-
if utils.is_in_notebook() and embed:
274-
utils.embed_url_in_notebook(space_url)
272+
try:
273+
deploy.create_space_if_not_exists(
274+
space_id,
275+
space_storage,
276+
dataset_id,
277+
bucket_id,
278+
private,
279+
)
280+
user_name, space_name = space_id.split("/")
281+
space_url = deploy.SPACE_HOST_URL.format(
282+
user_name=user_name, space_name=space_name
283+
)
284+
if utils.is_in_notebook() and embed:
285+
utils.embed_url_in_notebook(space_url)
286+
except Exception as e:
287+
_emit_nonfatal_warning(
288+
f"trackio.init() could not prepare Space '{space_id}': {e}. Logging will continue in local fallback mode until the Space is reachable."
289+
)
275290
context_vars.current_project.set(project)
276291

292+
existing_runs = _safe_get_runs_for_init(project)
293+
277294
if resume == "must":
278295
if name is None:
279296
raise ValueError("Must provide a run name when resume='must'")
280-
if name not in SQLiteStorage.get_runs(project):
297+
if name not in existing_runs:
281298
raise ValueError(f"Run '{name}' does not exist in project '{project}'")
282299
resumed = True
283300
elif resume == "allow":
284-
resumed = name is not None and name in SQLiteStorage.get_runs(project)
301+
resumed = name is not None and name in existing_runs
285302
elif resume == "never":
286-
if name is not None and name in SQLiteStorage.get_runs(project):
287-
warnings.warn(
303+
if name is not None and name in existing_runs:
304+
_emit_nonfatal_warning(
288305
f"* Warning: resume='never' but a run '{name}' already exists in "
289306
f"project '{project}'. Generating a new name and instead. If you want "
290307
"to resume this run, call init() with resume='must' or resume='allow'."
@@ -323,9 +340,19 @@ def init(
323340
)
324341

325342
if space_id is not None:
326-
SQLiteStorage.set_project_metadata(project, "space_id", space_id)
327-
if SQLiteStorage.has_pending_data(project):
328-
run._has_local_buffer = True
343+
try:
344+
SQLiteStorage.set_project_metadata(project, "space_id", space_id)
345+
except Exception as e:
346+
_emit_nonfatal_warning(
347+
f"trackio.init() could not persist Space metadata for project '{project}': {e}. Logging will continue."
348+
)
349+
try:
350+
if SQLiteStorage.has_pending_data(project):
351+
run._has_local_buffer = True
352+
except Exception as e:
353+
_emit_nonfatal_warning(
354+
f"trackio.init() could not inspect pending buffered data for project '{project}': {e}. Logging will continue."
355+
)
329356

330357
global _atexit_registered
331358
if not _atexit_registered:
@@ -341,7 +368,12 @@ def init(
341368
globals()["config"] = run.config
342369

343370
if _should_embed_local:
344-
show(project=project, open_browser=False, block_thread=False)
371+
try:
372+
show(project=project, open_browser=False, block_thread=False)
373+
except Exception as e:
374+
_emit_nonfatal_warning(
375+
f"trackio.init() could not auto-launch the dashboard: {e}. Logging will continue."
376+
)
345377

346378
return run
347379

@@ -415,7 +447,7 @@ def log_gpu(run: Run | None = None, device: int | None = None) -> dict:
415447
elif apple_gpu_available():
416448
return _log_apple_gpu(run=run)
417449
else:
418-
warnings.warn(
450+
_emit_nonfatal_warning(
419451
"No GPU detected. Install nvidia-ml-py for NVIDIA GPU support "
420452
"or psutil for Apple Silicon support."
421453
)
@@ -636,7 +668,7 @@ def save(
636668
hf_token=huggingface_hub.utils.get_token(),
637669
)
638670
except Exception as e:
639-
warnings.warn(
671+
_emit_nonfatal_warning(
640672
f"Failed to upload files: {e}. "
641673
"Files may not be available in the dashboard."
642674
)

0 commit comments

Comments
 (0)