@@ -97,7 +97,29 @@ def _cleanup_current_run():
9797 pass
9898
9999
100- def _safe_get_runs_for_init (project : str ) -> list [str ]:
100+ def _safe_get_runs_for_init (
101+ project : str ,
102+ space_id : str | None ,
103+ resume : str ,
104+ remote_client : RemoteClient | None = None ,
105+ check_existing_for_never : bool = False ,
106+ ) -> list [str ]:
107+ if space_id is not None :
108+ if resume == "never" and not check_existing_for_never :
109+ return []
110+ try :
111+ client = remote_client or RemoteClient (
112+ space_id ,
113+ hf_token = huggingface_hub .utils .get_token (),
114+ verbose = False ,
115+ )
116+ runs = client .predict (project = project , api_name = "/get_runs_for_project" )
117+ return runs if isinstance (runs , list ) else []
118+ except Exception as e :
119+ _emit_nonfatal_warning (
120+ f"trackio.init() could not inspect existing runs for project '{ project } ' on Space '{ space_id } ': { e } . Continuing without resume metadata."
121+ )
122+ return []
101123 try :
102124 return SQLiteStorage .get_runs (project )
103125 except Exception as e :
@@ -107,7 +129,32 @@ def _safe_get_runs_for_init(project: str) -> list[str]:
107129 return []
108130
109131
110- def _safe_get_latest_run_for_init (project : str , name : str ) -> dict | None :
132+ def _safe_get_latest_run_for_init (
133+ project : str ,
134+ name : str ,
135+ space_id : str | None = None ,
136+ remote_client : RemoteClient | None = None ,
137+ ) -> dict | None :
138+ if space_id is not None :
139+ try :
140+ client = remote_client or RemoteClient (
141+ space_id ,
142+ hf_token = huggingface_hub .utils .get_token (),
143+ verbose = False ,
144+ )
145+ runs = client .predict (project = project , api_name = "/get_runs_for_project" )
146+ if not isinstance (runs , list ):
147+ return None
148+ matches = [r for r in runs if isinstance (r , dict ) and r .get ("name" ) == name ]
149+ if not matches :
150+ return None
151+ matches .sort (key = lambda r : r .get ("created_at" ) or "" , reverse = True )
152+ return matches [0 ]
153+ except Exception as e :
154+ _emit_nonfatal_warning (
155+ f"trackio.init() could not inspect existing runs for project '{ project } ' on Space '{ space_id } ': { e } . Continuing without resume metadata."
156+ )
157+ return None
111158 try :
112159 return SQLiteStorage .get_latest_run_record_by_name (project , name )
113160 except Exception as e :
@@ -117,6 +164,50 @@ def _safe_get_latest_run_for_init(project: str, name: str) -> dict | None:
117164 return None
118165
119166
def _safe_get_last_step_for_init(
    project: str,
    run_name: str,
    space_id: str | None,
    resumed: bool,
    run_id: str | None = None,
    remote_client: RemoteClient | None = None,
) -> int | None:
    """Best-effort lookup of the last recorded step of a run being resumed.

    Any failure is reported through ``_emit_nonfatal_warning`` and turned
    into ``None`` rather than raised.

    Args:
        project: Project the run belongs to.
        run_name: Name of the run; used as the lookup key when ``run_id`` is
            not given, and in warning messages.
        space_id: When not ``None``, the step is read from that Space via the
            ``/get_run_summary`` endpoint; otherwise from ``SQLiteStorage``.
        resumed: Whether the run is being resumed. When false, no lookup is
            performed.
        run_id: Optional run identifier. For the Space lookup it replaces
            ``run_name`` as the key; for the local lookup it is forwarded as
            ``run_id=`` alongside ``run_name``.
        remote_client: Client to reuse for the Space lookup. A new
            ``RemoteClient`` is created only when this is falsy.

    Returns:
        The ``last_step`` value of the remote summary when it is an ``int``,
        or whatever ``SQLiteStorage.get_max_step_for_run`` returns for the
        local case. ``None`` when not resuming, when the remote response has
        an unexpected shape, or when the lookup raised.
    """
    if not resumed:
        return None

    if space_id is not None:
        try:
            client = remote_client or RemoteClient(
                space_id,
                hf_token=huggingface_hub.utils.get_token(),
                verbose=False,
            )
            summary_kwargs: dict[str, Any] = {
                "project": project,
                "api_name": "/get_run_summary",
            }
            # Exactly one of run_id / run is sent; run_id wins when available.
            if run_id is not None:
                summary_kwargs["run_id"] = run_id
            else:
                summary_kwargs["run"] = run_name
            summary = client.predict(**summary_kwargs)
            if isinstance(summary, dict):
                last_step = summary.get("last_step")
                return last_step if isinstance(last_step, int) else None
            return None
        except Exception as e:
            # Deliberately broad: init() must keep going without step metadata.
            _emit_nonfatal_warning(
                f"trackio.init() could not recover the previous step for run '{run_name}' on Space '{space_id}': {e}. Continuing from step 0."
            )
            return None

    try:
        return SQLiteStorage.get_max_step_for_run(project, run_name, run_id=run_id)
    except Exception as e:
        _emit_nonfatal_warning(
            f"trackio.init() could not recover the previous step for run '{run_name}': {e}. Continuing from step 0."
        )
        return None
209+
210+
120211def init (
121212 project : str ,
122213 name : str | None = None ,
@@ -298,8 +389,36 @@ def init(
298389 )
299390 context_vars .current_project .set (project )
300391
392+ remote_client = None
393+ if space_id is not None :
394+ try :
395+ remote_client = RemoteClient (
396+ space_id ,
397+ hf_token = huggingface_hub .utils .get_token (),
398+ verbose = False ,
399+ )
400+ except Exception as e :
401+ _emit_nonfatal_warning (
402+ f"trackio.init() could not create a Space client for '{ space_id } ': { e } . Continuing with local fallback metadata lookups."
403+ )
404+
405+ existing_run_records = _safe_get_runs_for_init (
406+ project ,
407+ space_id ,
408+ resume ,
409+ remote_client = remote_client ,
410+ check_existing_for_never = name is not None ,
411+ )
412+ existing_runs = [
413+ r ["name" ] if isinstance (r , dict ) else r for r in existing_run_records
414+ ]
415+
301416 existing_run = (
302- _safe_get_latest_run_for_init (project , name ) if name is not None else None
417+ _safe_get_latest_run_for_init (
418+ project , name , space_id = space_id , remote_client = remote_client
419+ )
420+ if name is not None
421+ else None
303422 )
304423 resolved_run_id = None
305424
@@ -319,6 +438,19 @@ def init(
319438 else :
320439 raise ValueError ("resume must be one of: 'must', 'allow', or 'never'" )
321440
441+ initial_last_step = (
442+ _safe_get_last_step_for_init (
443+ project ,
444+ name ,
445+ space_id ,
446+ resumed ,
447+ run_id = resolved_run_id ,
448+ remote_client = remote_client ,
449+ )
450+ if name is not None
451+ else None
452+ )
453+
322454 if auto_log_gpu is None :
323455 nvidia_available = gpu_available ()
324456 apple_available = apple_gpu_available ()
@@ -342,6 +474,8 @@ def init(
342474 group = group ,
343475 config = config ,
344476 space_id = space_id ,
477+ existing_runs = existing_runs ,
478+ initial_last_step = initial_last_step ,
345479 auto_log_gpu = auto_log_gpu ,
346480 gpu_log_interval = gpu_log_interval ,
347481 webhook_url = webhook_url ,
0 commit comments