Prevent trackio errors from crashing the user's training loop (#496)

abidlabs · claude · gradio-pr-bot · web-flow · commit af23d74438b1 · 2026-04-15T14:20:38.000-07:00
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
diff --git a/.changeset/upset-news-end.md b/.changeset/upset-news-end.md
@@ -0,0 +1,5 @@
+---
+"trackio": minor
+---
+
+feat:Prevent trackio errors from crashing the user's training loop
diff --git a/README.md b/README.md
@@ -245,6 +245,8 @@ To get started and see basic examples of usage, see these files:
 
 `trackio.log()` is a non-blocking call that appends to an in-memory queue and returns immediately. A background thread drains the queue every **0.5 s** and writes to the local SQLite database. Because log calls never touch the network or disk on the calling thread, the client-side throughput is effectively **unlimited** -- you can burst thousands of calls per second without slowing down your training loop.
 
+Trackio is written defensively so Trackio-side failures should never take down your main experiment code. Under normal usage, issues inside Trackio's logging, flushing, or delivery paths degrade to warnings and local buffering rather than exceptions from your training loop.
+
 ### Logging to a Hugging Face Space
 
 When a `space_id` is provided, the same background thread batches queued entries and pushes them to the Space via the Gradio client API. The main factors that affect end-to-end throughput are:
diff --git a/docs/source/track.md b/docs/source/track.md
@@ -52,6 +52,8 @@ Once your run is initialized, you can start logging data using the [`log`] funct
 trackio.log({"loss": 0.05})
 ```
 
+Trackio is written defensively so Trackio-side failures should never take down your main experiment code. Under normal usage, issues inside Trackio's logging, flushing, or delivery paths degrade to warnings and local buffering rather than exceptions from your training loop.
+
 Each call to [`log`] automatically increments the step counter.
 If you want to log multiple metrics at once, pass them together:
 
@@ -336,4 +338,4 @@ for batch_size in [16, 32, 64]:
         trackio.finish()
 ```
 
-In the dashboard, you can then group by "learning_rate" or "batch_size" to more easily compare runs with different hyperparameters.
+In the dashboard, you can then group by "learning_rate" or "batch_size" to more easily compare runs with different hyperparameters.
diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py
@@ -1,9 +1,10 @@
+import sqlite3
 import time
 from unittest.mock import MagicMock
 
 import pytest
 
-from trackio import Markdown, Run, init
+from trackio import Markdown, Run, init, utils
 from trackio.sqlite_storage import SQLiteStorage
 
 
@@ -158,3 +159,61 @@ def test_run_group_added(temp_dir):
         config={"learning_rate": 0.01},
     )
     assert run.config["_Group"] == "test_group"
+
+
+def test_log_does_not_crash_on_bad_metrics(temp_dir, monkeypatch):
+    run = Run(url=None, project="proj", client=None, name="safe-run", space_id=None)
+
+    original = utils.serialize_values
+
+    def exploding_serialize(metrics):
+        if "bad" in metrics:
+            raise RuntimeError("serialize boom")
+        return original(metrics)
+
+    monkeypatch.setattr(utils, "serialize_values", exploding_serialize)
+
+    with pytest.warns(UserWarning, match="trackio.log\\(\\) failed to process metrics"):
+        run.log({"bad": 1})
+
+    run.log({"loss": 0.5})
+    run.finish()
+
+    logs = SQLiteStorage.get_logs("proj", "safe-run")
+    assert len(logs) == 1
+    assert logs[0]["loss"] == 0.5
+
+
+def test_init_survives_storage_read_failures(temp_dir, monkeypatch):
+    def raise_db_error(*args, **kwargs):
+        raise sqlite3.DatabaseError("database disk image is malformed")
+
+    monkeypatch.setattr(SQLiteStorage, "get_runs", raise_db_error)
+    monkeypatch.setattr(SQLiteStorage, "get_max_step_for_run", raise_db_error)
+
+    with pytest.warns(UserWarning) as record:
+        run = init(project="broken-project", name="safe-run")
+
+    messages = [str(item.message) for item in record]
+    assert any("could not inspect existing runs" in message for message in messages)
+    assert any("could not recover the previous step" in message for message in messages)
+    assert isinstance(run, Run)
+    assert run.name == "safe-run"
+    assert run._next_step == 0
+
+    run.log({"loss": 0.5})
+    run.finish()
+
+
+def test_local_flush_failure_does_not_crash(temp_dir, monkeypatch):
+    run = Run(url=None, project="proj", client=None, name="safe-run", space_id=None)
+
+    def raise_db_error(*args, **kwargs):
+        raise sqlite3.DatabaseError("database disk image is malformed")
+
+    monkeypatch.setattr(SQLiteStorage, "bulk_log", raise_db_error)
+
+    run.log({"loss": 0.5})
+
+    with pytest.warns(UserWarning, match="trackio failed to flush metric logs"):
+        run.finish()
diff --git a/trackio/__init__.py b/trackio/__init__.py
@@ -39,7 +39,7 @@
 from trackio.sqlite_storage import SQLiteStorage
 from trackio.table import Table
 from trackio.typehints import UploadEntry
-from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR
+from trackio.utils import TRACKIO_DIR, TRACKIO_LOGO_DIR, _emit_nonfatal_warning
 
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
@@ -98,6 +98,16 @@ def _cleanup_current_run():
             pass
 
 
+def _safe_get_runs_for_init(project: str) -> list[str]:
+    try:
+        return SQLiteStorage.get_runs(project)
+    except Exception as e:
+        _emit_nonfatal_warning(
+            f"trackio.init() could not inspect existing runs for project '{project}': {e}. Continuing without resume metadata."
+        )
+        return []
+
+
 def init(
     project: str,
     name: str | None = None,
@@ -195,7 +205,7 @@ def init(
         `Run`: A [`Run`] object that can be used to log metrics and finish the run.
     """
     if settings is not None:
-        warnings.warn(
+        _emit_nonfatal_warning(
             "* Warning: settings is not used. Provided for compatibility with wandb.init(). Please create an issue at: https://github.com/gradio-app/trackio/issues if you need a specific feature implemented."
         )
 
@@ -215,7 +225,7 @@ def init(
         ) from e
 
     if space_id is None and bucket_id is not None:
-        warnings.warn(
+        _emit_nonfatal_warning(
             "trackio.init() has `bucket_id` set but `space_id` is None: metrics will be logged "
             "locally only. Pass `space_id` to create or use a Hugging Face Space, which will be "
             "attached to the Hugging Face Bucket.",
@@ -259,32 +269,39 @@ def init(
             if not _should_embed_local:
                 utils.print_dashboard_instructions(project)
         else:
-            deploy.create_space_if_not_exists(
-                space_id,
-                space_storage,
-                dataset_id,
-                bucket_id,
-                private,
-            )
-            user_name, space_name = space_id.split("/")
-            space_url = deploy.SPACE_HOST_URL.format(
-                user_name=user_name, space_name=space_name
-            )
-            if utils.is_in_notebook() and embed:
-                utils.embed_url_in_notebook(space_url)
+            try:
+                deploy.create_space_if_not_exists(
+                    space_id,
+                    space_storage,
+                    dataset_id,
+                    bucket_id,
+                    private,
+                )
+                user_name, space_name = space_id.split("/")
+                space_url = deploy.SPACE_HOST_URL.format(
+                    user_name=user_name, space_name=space_name
+                )
+                if utils.is_in_notebook() and embed:
+                    utils.embed_url_in_notebook(space_url)
+            except Exception as e:
+                _emit_nonfatal_warning(
+                    f"trackio.init() could not prepare Space '{space_id}': {e}. Logging will continue in local fallback mode until the Space is reachable."
+                )
     context_vars.current_project.set(project)
 
+    existing_runs = _safe_get_runs_for_init(project)
+
     if resume == "must":
         if name is None:
             raise ValueError("Must provide a run name when resume='must'")
-        if name not in SQLiteStorage.get_runs(project):
+        if name not in existing_runs:
             raise ValueError(f"Run '{name}' does not exist in project '{project}'")
         resumed = True
     elif resume == "allow":
-        resumed = name is not None and name in SQLiteStorage.get_runs(project)
+        resumed = name is not None and name in existing_runs
     elif resume == "never":
-        if name is not None and name in SQLiteStorage.get_runs(project):
-            warnings.warn(
+        if name is not None and name in existing_runs:
+            _emit_nonfatal_warning(
                 f"* Warning: resume='never' but a run '{name}' already exists in "
                 f"project '{project}'. Generating a new name and instead. If you want "
                 "to resume this run, call init() with resume='must' or resume='allow'."
@@ -323,9 +340,19 @@ def init(
     )
 
     if space_id is not None:
-        SQLiteStorage.set_project_metadata(project, "space_id", space_id)
-        if SQLiteStorage.has_pending_data(project):
-            run._has_local_buffer = True
+        try:
+            SQLiteStorage.set_project_metadata(project, "space_id", space_id)
+        except Exception as e:
+            _emit_nonfatal_warning(
+                f"trackio.init() could not persist Space metadata for project '{project}': {e}. Logging will continue."
+            )
+        try:
+            if SQLiteStorage.has_pending_data(project):
+                run._has_local_buffer = True
+        except Exception as e:
+            _emit_nonfatal_warning(
+                f"trackio.init() could not inspect pending buffered data for project '{project}': {e}. Logging will continue."
+            )
 
     global _atexit_registered
     if not _atexit_registered:
@@ -341,7 +368,12 @@ def init(
     globals()["config"] = run.config
 
     if _should_embed_local:
-        show(project=project, open_browser=False, block_thread=False)
+        try:
+            show(project=project, open_browser=False, block_thread=False)
+        except Exception as e:
+            _emit_nonfatal_warning(
+                f"trackio.init() could not auto-launch the dashboard: {e}. Logging will continue."
+            )
 
     return run
 
@@ -415,7 +447,7 @@ def log_gpu(run: Run | None = None, device: int | None = None) -> dict:
     elif apple_gpu_available():
         return _log_apple_gpu(run=run)
     else:
-        warnings.warn(
+        _emit_nonfatal_warning(
             "No GPU detected. Install nvidia-ml-py for NVIDIA GPU support "
             "or psutil for Apple Silicon support."
         )
@@ -636,7 +668,7 @@ def save(
                     hf_token=huggingface_hub.utils.get_token(),
                 )
             except Exception as e:
-                warnings.warn(
+                _emit_nonfatal_warning(
                     f"Failed to upload files: {e}. "
                     "Files may not be available in the dashboard."
                 )
diff --git a/trackio/run.py b/trackio/run.py
diff --git a/trackio/utils.py b/trackio/utils.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"trackio": minor
 +---
++
 +feat:Prevent trackio errors from crashing the user's training loop