Misc fixes: clarify watcher docs, validate `watch()` mode, disambiguate duplicate run names in CLI reports

Saba9 · Saba9 · commit 73f17d0690f8 · 2026-05-11T16:03:10.000-07:00
diff --git a/.changeset/every-spoons-smash.md b/.changeset/every-spoons-smash.md
@@ -2,4 +2,11 @@
 "trackio": minor
 ---
 
-feat:Add additional support for autonomous ML experiments
+feat: Add additional support for autonomous ML experiments
+
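+- `trackio.watch()` registers metric watchers (NaN/Inf, threshold, spike, stagnation, custom fn) that are evaluated on every `trackio.log()` call and fire alerts automatically when their conditions are met; `trackio.should_stop()` reports whether any watcher has triggered a stop condition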
+- `AlertReason` constants for programmatic alert filtering
+- Run lifecycle status tracking (`running` → `finished` / `failed`) persisted in SQLite
+- New CLI commands: `trackio best`, `trackio compare`, `trackio summary`
+- `Run.status`, `Run.final_metrics`, `Run.metrics()`, `Run.history()` on the Python API
+- `alerts.data` column (SQL migration) for structured alert metadata
diff --git a/docs/source/alerts.md b/docs/source/alerts.md
@@ -73,7 +73,7 @@ Watcher-generated alerts are stored, displayed in the dashboard, and delivered t
 |---|---|---|---|
 | `metric` | `str` | *(required)* | The metric name to watch (e.g., `"train/loss"`). |
 | `nan` | `bool` | `True` | Fire an ERROR alert if the value becomes NaN or Inf. |
-| `spike_factor` | `float \| None` | `None` | Fire a WARN alert when the value deviates from the recent moving average by this factor (e.g., `3.0` = 3× the average). |
+| `spike_factor` | `float \| None` | `None` | Fire a WARN alert when `\|value − recent_avg\| > (spike_factor − 1) × \|recent_avg\|` (e.g., `3.0` triggers when the deviation exceeds 2× `\|recent_avg\|`). Symmetric — drops trigger too. |
 | `patience` | `int \| None` | `None` | Fire a WARN alert if no improvement is seen for this many log steps. Also sets `should_stop()` to `True`. |
 | `min_delta` | `float` | `0.0` | Minimum change to count as an improvement (used with `patience`). |
 | `max_value` | `float \| None` | `None` | Fire an ERROR alert if the value exceeds this threshold. Also sets `should_stop()` to `True`. |
@@ -93,7 +93,7 @@ trackio.watch("train/loss", nan=True)
 
 #### Max / Min Thresholds
 
-`max_value` fires an **ERROR** alert (and stops) when the metric exceeds the threshold. `min_value` fires a **WARN** alert when it falls below. Each alert fires once when the threshold is crossed and resets if the value recovers.
+`max_value` fires an **ERROR** alert (and stops) when the metric exceeds the threshold. `min_value` fires a **WARN** alert when the metric falls below its threshold, but — unlike `max_value` — does **not** set `should_stop()`. Each alert fires once when the threshold is crossed and resets if the value recovers.
 
 ```python
 trackio.watch("train/loss", max_value=20.0)
@@ -102,7 +102,7 @@ trackio.watch("val/accuracy", min_value=0.5)
 
 #### Spike Detection
 
-Fires a **WARN** alert when the value deviates from the recent moving average by more than `(spike_factor - 1) × avg`. The alert resets automatically once the value returns to normal.
+Fires a **WARN** alert when the value deviates from the recent moving average by more than `(spike_factor - 1) × |recent_avg|` — that is, when `|value − recent_avg| > (spike_factor − 1) × |recent_avg|`. Detection is symmetric: sudden drops trigger the alert in addition to sudden rises. With `spike_factor=3.0` and a recent average of `1.0`, the alert fires once `|value − 1.0| > 2.0`. The alert resets automatically once the value returns to normal.
 
 ```python
 trackio.watch("train/loss", spike_factor=3.0, window=10)
@@ -118,7 +118,7 @@ trackio.watch("val/accuracy", patience=50, min_delta=0.001, mode="max")
 
 ### Early Stopping
 
-[`should_stop`] returns `True` if any watcher has triggered a stop condition (NaN/Inf, `max_value` exceeded, or `patience` exhausted):
+[`should_stop`] returns `True` if any watcher has triggered a stop condition (NaN/Inf, `max_value` exceeded, `patience` exhausted, or a custom `fn` watcher returned an alert dict containing `"stop": True`):
 
 ```python
 for step in range(1000):
diff --git a/trackio/__init__.py b/trackio/__init__.py
@@ -684,7 +684,7 @@ def init(
     globals()["config"] = run.config
     if _watcher_manager._watchers:
         _emit_nonfatal_warning(
-            "trackio.init() cleared existing metric watchers. Call trackio.watch() after trackio.init()."
+            "trackio.init() will clear existing metric watchers. Call trackio.watch() after trackio.init()."
         )
     _watcher_manager.clear()
 
@@ -835,6 +835,7 @@ def alert(
     run.alert(title=title, text=text, level=level, webhook_url=webhook_url, data=data)
 
 
+# Not thread-safe: concurrent trackio.log() from multiple threads may race on watcher state.
 _watcher_manager = WatcherManager()
 
 
@@ -852,17 +853,19 @@ def watch(
 ) -> None:
     """
     Register a metric watcher that automatically fires alerts when conditions
-    are met during ``trackio.log()`` calls. Must be called after
-    ``trackio.init()`` — watchers are cleared when a new run starts.
+    are met during ``trackio.log()`` calls. Typically called after
+    ``trackio.init()`` — watchers registered earlier will persist until the
+    next ``trackio.init()`` clears them.
 
     Args:
         metric (`str`):
             The metric name to watch (e.g., ``"train/loss"``).
         nan (`bool`, *optional*, defaults to `True`):
             Fire an ERROR alert if the metric becomes NaN or Inf.
         spike_factor (`float`, *optional*):
-            Fire a WARN alert if the value exceeds the recent moving average
-            by this factor (e.g., ``3.0`` means 3x the recent average).
+            Fire a WARN alert if the absolute deviation from the recent moving
+            average exceeds ``(spike_factor - 1) * |recent_avg|`` (e.g.,
+            ``3.0`` triggers when ``|value - avg| > 2 * |avg|``).
         patience (`int`, *optional*):
             Fire a WARN alert if no improvement is seen for this many log
             steps. Also sets ``should_stop()`` to True.
@@ -877,14 +880,18 @@ def watch(
             Number of recent values to use for spike detection averaging.
         mode (`str`, *optional*, defaults to ``"min"``):
             Whether lower (``"min"``) or higher (``"max"``) values are better.
-            Affects patience-based stagnation detection.
+            Must be ``"min"`` or ``"max"``. Affects patience-based stagnation
+            detection.
         fn (`Callable[[float, int | None], bool | list[dict] | None]`, *optional*):
             A custom condition called as ``fn(value, step)`` on every
-            ``trackio.log()`` call. Return ``True`` to fire a default WARN
-            alert, a list of alert dicts for full control, or a falsy value
-            for no alert. Include ``"stop": True`` in a returned dict to
-            also set ``should_stop()`` to ``True``.
+            ``trackio.log()`` call (where ``value`` is the most recent metric
+            value and ``step`` is the log step or ``None``). Return ``True``
+            to fire a default WARN alert, a list of alert dicts for full
+            control, or a falsy value for no alert. Include ``"stop": True``
+            in a returned dict to also set ``should_stop()`` to ``True``.
     """
+    if mode not in ("min", "max"):
+        raise ValueError(f"trackio.watch(): mode={mode!r}; expected 'min' or 'max'.")
     watcher = MetricWatcher(
         metric_name=metric,
         nan=nan,
diff --git a/trackio/cli.py b/trackio/cli.py
@@ -150,7 +150,10 @@ def _handle_config(args):
 
 
 def _extract_reports(
-    run: str, logs: list[dict], report_name: str | None = None
+    run: str,
+    logs: list[dict],
+    report_name: str | None = None,
+    run_id: str | None = None,
 ) -> list[dict]:
     reports = []
     for log in logs:
@@ -165,6 +168,7 @@ def _extract_reports(
                     reports.append(
                         {
                             "run": run,
+                            "run_id": run_id,
                             "report": key,
                             "step": step,
                             "timestamp": timestamp,
@@ -887,9 +891,16 @@ def main():
         if trailing_globals.hf_token is not None:
             args.hf_token = trailing_globals.hf_token
 
-    if args.command in ("show", "status", "sync", "freeze", "skills") and _get_space(
-        args
-    ):
+    if args.command in (
+        "show",
+        "status",
+        "sync",
+        "freeze",
+        "skills",
+        "best",
+        "compare",
+        "summary",
+    ) and _get_space(args):
         error_exit(
             f"The '{args.command}' command does not support --space (remote mode)."
         )
@@ -1064,21 +1075,33 @@ def main():
                 run_records = remote.predict(
                     args.project, api_name="/get_runs_for_project"
                 )
-                runs = [r["name"] if isinstance(r, dict) else r for r in run_records]
+                records = [
+                    r if isinstance(r, dict) else {"name": r, "id": r}
+                    for r in run_records
+                ]
             else:
                 _require_project(args.project)
-                runs = SQLiteStorage.get_runs(args.project)
-            if args.run and args.run not in runs:
+                records = SQLiteStorage.get_run_records(args.project)
+
+            run_names = [r["name"] for r in records]
+            if args.run and args.run not in run_names:
                 error_exit(f"Run '{args.run}' not found in project '{args.project}'.")
 
-            target_runs = [args.run] if args.run else runs
+            target_records = (
+                [r for r in records if r["name"] == args.run] if args.run else records
+            )
+            target_names = [r["name"] for r in target_records]
+            has_dupes = len(target_names) != len(set(target_names))
+
             all_reports = []
-            for run_name in target_runs:
+            for rec in target_records:
+                run_name = rec["name"]
+                run_id = rec.get("id")
                 if remote:
                     logs = remote.predict(args.project, run_name, api_name="/get_logs")
                 else:
-                    logs = SQLiteStorage.get_logs(args.project, run_name)
-                all_reports.extend(_extract_reports(run_name, logs))
+                    logs = SQLiteStorage.get_logs(args.project, run_name, run_id=run_id)
+                all_reports.extend(_extract_reports(run_name, logs, run_id=run_id))
 
             if args.json:
                 print(
@@ -1091,10 +1114,14 @@ def main():
                     )
                 )
             else:
-                report_lines = [
-                    f"{entry['run']} | {entry['report']} | step={entry['step']} | {entry['timestamp']}"
-                    for entry in all_reports
-                ]
+                report_lines = []
+                for entry in all_reports:
+                    label = entry["run"]
+                    if has_dupes and entry.get("run_id"):
+                        label += f" ({entry['run_id'][:8]})"
+                    report_lines.append(
+                        f"{label} | {entry['report']} | step={entry['step']} | {entry['timestamp']}"
+                    )
                 if args.run:
                     print(
                         format_list(
diff --git a/trackio/cli_helpers.py b/trackio/cli_helpers.py
@@ -190,6 +190,11 @@ def format_compare(
         f"Comparing {len(comparison)} runs across {len(metric_names)} metrics\n"
     )
 
+    max_run_name_w = 40
+    for e in comparison:
+        if len(e["run"]) > max_run_name_w:
+            e["run"] = e["run"][: max_run_name_w - 1] + "…"
+
     run_w = max((len(e["run"]) for e in comparison), default=3)
     run_w = max(run_w, 3)
     status_w = 10