fix test harnesses

Saba9 · Saba9 · commit e58200b25c87 · 2026-05-11T18:55:04.000-07:00
diff --git a/autonomous-experiments/test_harness/agent_runner.py b/autonomous-experiments/test_harness/agent_runner.py
@@ -13,15 +13,14 @@
 import subprocess
 import sys
 import time
-from datetime import datetime, timezone
 from pathlib import Path
 
 SIMULATOR = str(Path(__file__).parent / "simulate_training.py")
 
 
 def run_cli(args_list):
     result = subprocess.run(
-        ["trackio"] + args_list + ["--json"],
+        [sys.executable, "-m", "trackio.cli", *args_list, "--json"],
         capture_output=True,
         text=True,
     )
@@ -57,12 +56,10 @@ def run_training(project, run_name, **kwargs):
     return result.returncode
 
 
-def get_alerts(project, run_name=None, since=None):
+def get_alerts(project, run_name=None):
     args = ["list", "alerts", "--project", project]
     if run_name:
         args.extend(["--run", run_name])
-    if since:
-        args.extend(["--since", since])
     result = run_cli(args)
     if result and "alerts" in result:
         return result["alerts"]
@@ -91,9 +88,10 @@ def experiment_failure_recovery(project):
                 error_alerts[0]["title"] if error_alerts else "non-zero exit code"
             )
             print(f"  [AGENT] Attempt {attempt} failed: {error_msg}")
+            prev_lr = lr
             lr *= 0.1
             print(f"  [AGENT] Reducing LR to {lr}")
-            attempts.append({"run": run_name, "status": "failed", "lr": lr * 10})
+            attempts.append({"run": run_name, "status": "failed", "lr": prev_lr})
         else:
             result = run_cli(
                 [
@@ -127,11 +125,10 @@ def experiment_failure_recovery(project):
 def experiment_long_monitoring(project):
     print("\n" + "=" * 60)
     print("EXPERIMENT: Long-Running Monitoring")
-    print("Goal: Test alert polling with --since during active training")
+    print("Goal: Test alert polling with alert_id dedup during active training")
     print("=" * 60)
 
     run_name = "long-run"
-    since = datetime.now(timezone.utc).isoformat()
 
     cmd = [
         sys.executable,
@@ -154,23 +151,21 @@ def experiment_long_monitoring(project):
 
     print("  [AGENT] Starting long training run in background...")
     proc = subprocess.Popen(
-        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True
     )
 
-    all_alerts = []
+    seen_ids: set[str] = set()
 
     while proc.poll() is None:
         time.sleep(0.5)
-        alerts = get_alerts(project, run_name, since=since)
-
-        new_alerts = [a for a in alerts if a not in all_alerts]
-        if new_alerts:
-            for alert in new_alerts:
-                print(
-                    f"  [AGENT] New alert: [{alert.get('level', '?')}] {alert.get('title', '?')}"
-                )
-                all_alerts.append(alert)
-            since = datetime.now(timezone.utc).isoformat()
+        alerts = get_alerts(project, run_name)
+        new_alerts = [a for a in alerts if a.get("alert_id") not in seen_ids]
+        for alert in new_alerts:
+            print(
+                f"  [AGENT] New alert: [{alert.get('level', '?')}] {alert.get('title', '?')}"
+            )
+            if alert.get("alert_id") is not None:
+                seen_ids.add(alert["alert_id"])
 
     stdout, _ = proc.communicate()
     print(f"  [AGENT] Training finished. Exit code: {proc.returncode}")
@@ -184,15 +179,14 @@ def experiment_long_monitoring(project):
 EXPERIMENTS = {
     "failure_recovery": experiment_failure_recovery,
     "long_monitoring": experiment_long_monitoring,
-    "all": None,
 }
 
 
 def main():
     parser = argparse.ArgumentParser(description="Agent test runner for autonomous ML")
     parser.add_argument(
         "--experiment",
-        choices=list(EXPERIMENTS.keys()),
+        choices=[*EXPERIMENTS.keys(), "all"],
         default="all",
         help="Which experiment to run",
     )
@@ -203,10 +197,7 @@ def main():
     )
     args = parser.parse_args()
 
-    if args.experiment == "all":
-        experiments = [k for k in EXPERIMENTS if k != "all"]
-    else:
-        experiments = [args.experiment]
+    experiments = list(EXPERIMENTS) if args.experiment == "all" else [args.experiment]
 
     results = {}
 
diff --git a/autonomous-experiments/test_harness/simulate_training.py b/autonomous-experiments/test_harness/simulate_training.py
@@ -9,7 +9,6 @@
 import argparse
 import math
 import random
-import sys
 import time
 
 import trackio
@@ -90,70 +89,19 @@ def main():
 
     trackio.init(project=args.project, name=args.run_name, config=config)
 
-    best_val_loss = float("inf")
-    stagnation_count = 0
+    trackio.watch("train/loss", nan=True, max_value=10.0, spike_factor=3.0, window=10)
 
     for step in range(args.steps):
         train_loss = simulate_loss(
             step, args.steps, args.lr, args.depth, args.batch_size
         )
 
-        if args.spike_at_step and step == args.spike_at_step:
+        if args.spike_at_step is not None and step == args.spike_at_step:
             train_loss *= 10.0
-            trackio.alert(
-                "Loss spike detected",
-                text=f"Loss spiked to {train_loss:.4f} at step {step}",
-                level=AlertLevel.WARN,
-            )
-
-        if math.isnan(train_loss) or math.isinf(train_loss):
-            trackio.alert(
-                "NaN/Inf loss detected",
-                text=f"Loss became {train_loss} at step {step}. Training is diverging.",
-                level=AlertLevel.ERROR,
-            )
-            trackio.log({"train/loss": train_loss, "val/loss": train_loss}, step=step)
-            trackio.finish()
-            print(f"TERMINATED EARLY: NaN/Inf loss at step {step}")
-            sys.exit(1)
 
         val_loss = simulate_val_loss(train_loss, step, args.steps, args.depth)
         accuracy = simulate_accuracy(val_loss)
 
-        if val_loss < best_val_loss:
-            best_val_loss = val_loss
-            stagnation_count = 0
-        else:
-            stagnation_count += 1
-
-        if train_loss > 10.0 and step > 50:
-            trackio.alert(
-                "Training diverging",
-                text=f"Loss {train_loss:.4f} is very high at step {step}. Learning rate may be too high.",
-                level=AlertLevel.ERROR,
-            )
-            trackio.log(
-                {
-                    "train/loss": round(train_loss, 4),
-                    "val/loss": round(val_loss, 4),
-                    "accuracy": round(accuracy, 4),
-                    "best_val_loss": round(best_val_loss, 4),
-                    "lr": args.lr,
-                },
-                step=step,
-            )
-            trackio.finish()
-            print(f"TERMINATED EARLY: diverging at step {step}")
-            sys.exit(1)
-
-        if stagnation_count >= 100 and step > 100:
-            trackio.alert(
-                "Training stagnated",
-                text=f"Val loss has not improved for {stagnation_count} steps. Best: {best_val_loss:.4f}",
-                level=AlertLevel.WARN,
-            )
-            stagnation_count = 0
-
         if val_loss > train_loss * 1.5 and step > args.steps * 0.5:
             trackio.alert(
                 "Overfitting detected",
@@ -166,22 +114,25 @@ def main():
                 "train/loss": round(train_loss, 4),
                 "val/loss": round(val_loss, 4),
                 "accuracy": round(accuracy, 4),
-                "best_val_loss": round(best_val_loss, 4),
                 "lr": args.lr,
             },
             step=step,
         )
 
+        if trackio.should_stop():
+            print(f"TERMINATED EARLY: watcher triggered stop at step {step}")
+            break
+
         if args.sleep > 0:
             time.sleep(args.sleep)
 
     trackio.alert(
         "Training complete",
-        text=f"Finished {args.steps} steps. Best val loss: {best_val_loss:.4f}",
+        text=f"Finished at step {step}.",
         level=AlertLevel.INFO,
     )
     trackio.finish()
-    print(f"Training complete. Best val loss: {best_val_loss:.4f}")
+    print(f"Training complete. Final step: {step}")
 
 
 if __name__ == "__main__":