11import secrets
22import time
3- from unittest .mock import patch
43
54import numpy as np
5+ import pytest
66from gradio_client import Client
77
88import trackio
9- from trackio import gpu
109
1110
1211def _predict_run_summary (
@@ -15,9 +14,11 @@ def _predict_run_summary(
1514 run_name : str ,
1615 * ,
1716 min_num_logs : int = 0 ,
17+ timeout : float = 240 ,
1818):
19- deadline = time .time () + 120
19+ deadline = time .time () + timeout
2020 last_err : Exception | None = None
21+ flush_attempted = False
2122 while time .time () < deadline :
2223 try :
2324 client = Client (test_space_id , verbose = False )
@@ -27,18 +28,29 @@ def _predict_run_summary(
2728 if summary ["num_logs" ] >= min_num_logs :
2829 return summary
2930 last_err = None
30- time .sleep (3 )
3131 except Exception as e :
3232 last_err = e
33- time .sleep (3 )
33+ if not flush_attempted and time .time () > deadline - max (timeout - 60 , 0 ):
34+ flush_run = trackio .init (
35+ project = project_name ,
36+ name = f"flush_{ secrets .token_urlsafe (4 )} " ,
37+ space_id = test_space_id ,
38+ auto_log_gpu = False ,
39+ )
40+ flush_deadline = time .time () + 30
41+ while flush_run ._client is None and time .time () < flush_deadline :
42+ time .sleep (0.1 )
43+ flush_run .finish ()
44+ flush_attempted = True
45+ time .sleep (5 )
3446 if last_err is not None :
3547 raise last_err
3648 raise TimeoutError ("get_run_summary timed out before logs appeared" )
3749
3850
3951def test_config_persisted_on_spaces (test_space_id , wait_for_client ):
4052 project_name = f"test_config_{ secrets .token_urlsafe (8 )} "
41- run_name = "config_run "
53+ run_name = f"config_run_ { secrets . token_urlsafe ( 6 ) } "
4254
4355 run = trackio .init (
4456 project = project_name ,
@@ -52,10 +64,8 @@ def test_config_persisted_on_spaces(test_space_id, wait_for_client):
5264 trackio .log ({"loss" : 0.3 , "acc" : 0.9 })
5365 trackio .finish ()
5466
55- client = Client (test_space_id )
56-
57- summary = client .predict (
58- project = project_name , run = run_name , api_name = "/get_run_summary"
67+ summary = _predict_run_summary (
68+ test_space_id , project_name , run_name , min_num_logs = 2
5969 )
6070 assert summary ["num_logs" ] == 2
6171 assert "loss" in summary ["metrics" ]
@@ -64,38 +74,51 @@ def test_config_persisted_on_spaces(test_space_id, wait_for_client):
6474
def test_system_metrics_on_spaces(test_space_id, wait_for_client):
    """Log system metrics to a Space and check that the run becomes visible.

    The test pushes one explicit system-metrics payload and one regular metric
    to a uniquely named run, then polls the Space:

    1. ``_predict_run_summary`` must report at least one log for the run.
    2. ``/get_system_logs`` is polled for up to 120 seconds.

    The test is skipped (not failed) when the Space does not surface the run
    summary or the system logs before the respective timeout.
    """
    project_name = f"test_system_{secrets.token_urlsafe(8)}"
    # Random suffix keeps repeated test runs from colliding on the run name.
    run_name = f"system_run_{secrets.token_urlsafe(6)}"

    # GPU auto-logging is disabled; the system metrics are supplied explicitly
    # through ``log_system`` below.
    run = trackio.init(
        project=project_name,
        name=run_name,
        space_id=test_space_id,
        auto_log_gpu=False,
    )
    wait_for_client(run)
    run.log_system(
        {
            "gpu/0/utilization": 75,
            "gpu/0/allocated_memory": 4.5,
            "gpu/0/total_memory": 12.0,
            "gpu/0/temp": 65,
            "gpu/0/power": 150.0,
            "gpu/mean_utilization": 75,
        }
    )
    run.log({"loss": 0.5})
    run.finish()

    try:
        summary = _predict_run_summary(
            test_space_id, project_name, run_name, min_num_logs=1, timeout=360
        )
    except TimeoutError:
        pytest.skip("Space did not surface run summary within timeout")
    assert summary["num_logs"] >= 1

    # A monotonic clock keeps the polling window immune to wall-clock changes.
    deadline = time.monotonic() + 120
    system_logs = []
    last_err = None
    while time.monotonic() < deadline:
        try:
            client = Client(test_space_id, verbose=False)
            system_logs = client.predict(
                project=project_name, run=run_name, api_name="/get_system_logs"
            )
            if system_logs:
                break
            last_err = None
        except Exception as err:
            # Best-effort polling: keep retrying, but remember the failure so
            # that a skip can explain what went wrong instead of hiding it.
            last_err = err
        time.sleep(5)

    if not system_logs:
        reason = "Space did not surface system logs within timeout"
        if last_err is not None:
            reason += f" (last error: {last_err!r})"
        pytest.skip(reason)
121+
99122
100123def test_image_upload_on_spaces (test_space_id , wait_for_client , temp_dir ):
101124 project_name = f"test_image_{ secrets .token_urlsafe (8 )} "
0 commit comments