1313import subprocess
1414import sys
1515import time
16- from datetime import datetime , timezone
1716from pathlib import Path
1817
1918SIMULATOR = str (Path (__file__ ).parent / "simulate_training.py" )
2019
2120
2221def run_cli (args_list ):
2322 result = subprocess .run (
24- [" trackio" ] + args_list + [ "--json" ],
23+ [sys . executable , "-m" , " trackio.cli" , * args_list , "--json" ],
2524 capture_output = True ,
2625 text = True ,
2726 )
@@ -57,12 +56,10 @@ def run_training(project, run_name, **kwargs):
5756 return result .returncode
5857
5958
60- def get_alerts (project , run_name = None , since = None ):
59+ def get_alerts (project , run_name = None ):
6160 args = ["list" , "alerts" , "--project" , project ]
6261 if run_name :
6362 args .extend (["--run" , run_name ])
64- if since :
65- args .extend (["--since" , since ])
6663 result = run_cli (args )
6764 if result and "alerts" in result :
6865 return result ["alerts" ]
@@ -91,9 +88,10 @@ def experiment_failure_recovery(project):
9188 error_alerts [0 ]["title" ] if error_alerts else "non-zero exit code"
9289 )
9390 print (f" [AGENT] Attempt { attempt } failed: { error_msg } " )
91+ prev_lr = lr
9492 lr *= 0.1
9593 print (f" [AGENT] Reducing LR to { lr } " )
96- attempts .append ({"run" : run_name , "status" : "failed" , "lr" : lr * 10 })
94+ attempts .append ({"run" : run_name , "status" : "failed" , "lr" : prev_lr })
9795 else :
9896 result = run_cli (
9997 [
@@ -127,11 +125,10 @@ def experiment_failure_recovery(project):
127125def experiment_long_monitoring (project ):
128126 print ("\n " + "=" * 60 )
129127 print ("EXPERIMENT: Long-Running Monitoring" )
130- print ("Goal: Test alert polling with --since during active training" )
128+ print ("Goal: Test alert polling with alert_id dedup during active training" )
131129 print ("=" * 60 )
132130
133131 run_name = "long-run"
134- since = datetime .now (timezone .utc ).isoformat ()
135132
136133 cmd = [
137134 sys .executable ,
@@ -154,23 +151,21 @@ def experiment_long_monitoring(project):
154151
155152 print (" [AGENT] Starting long training run in background..." )
156153 proc = subprocess .Popen (
157- cmd , stdout = subprocess .PIPE , stderr = subprocess .PIPE , text = True
154+ cmd , stdout = subprocess .PIPE , stderr = subprocess .DEVNULL , text = True
158155 )
159156
160- all_alerts = []
157+ seen_ids : set [ str ] = set ()
161158
162159 while proc .poll () is None :
163160 time .sleep (0.5 )
164- alerts = get_alerts (project , run_name , since = since )
165-
166- new_alerts = [a for a in alerts if a not in all_alerts ]
167- if new_alerts :
168- for alert in new_alerts :
169- print (
170- f" [AGENT] New alert: [{ alert .get ('level' , '?' )} ] { alert .get ('title' , '?' )} "
171- )
172- all_alerts .append (alert )
173- since = datetime .now (timezone .utc ).isoformat ()
161+ alerts = get_alerts (project , run_name )
162+ new_alerts = [a for a in alerts if a .get ("alert_id" ) not in seen_ids ]
163+ for alert in new_alerts :
164+ print (
165+ f" [AGENT] New alert: [{ alert .get ('level' , '?' )} ] { alert .get ('title' , '?' )} "
166+ )
167+ if alert .get ("alert_id" ) is not None :
168+ seen_ids .add (alert ["alert_id" ])
174169
175170 stdout , _ = proc .communicate ()
176171 print (f" [AGENT] Training finished. Exit code: { proc .returncode } " )
@@ -184,15 +179,14 @@ def experiment_long_monitoring(project):
184179EXPERIMENTS = {
185180 "failure_recovery" : experiment_failure_recovery ,
186181 "long_monitoring" : experiment_long_monitoring ,
187- "all" : None ,
188182}
189183
190184
191185def main ():
192186 parser = argparse .ArgumentParser (description = "Agent test runner for autonomous ML" )
193187 parser .add_argument (
194188 "--experiment" ,
195- choices = list ( EXPERIMENTS .keys ()) ,
189+ choices = [ * EXPERIMENTS .keys (), "all" ] ,
196190 default = "all" ,
197191 help = "Which experiment to run" ,
198192 )
@@ -203,10 +197,7 @@ def main():
203197 )
204198 args = parser .parse_args ()
205199
206- if args .experiment == "all" :
207- experiments = [k for k in EXPERIMENTS if k != "all" ]
208- else :
209- experiments = [args .experiment ]
200+ experiments = list (EXPERIMENTS ) if args .experiment == "all" else [args .experiment ]
210201
211202 results = {}
212203
0 commit comments