Skip to content

Commit 46a3cc3

Browse files
Fix/remove flaky E2E space tests (#485)
Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
1 parent e8a897d commit 46a3cc3

7 files changed

Lines changed: 184 additions & 20 deletions

File tree

.changeset/khaki-rockets-cough.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"trackio": minor
3+
---
4+
5+
feat: Fix/remove flaky E2E space tests

tests/e2e-spaces/conftest.py

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import os
22
import time
33

4+
import huggingface_hub
45
import pytest
56
from gradio_client import Client
7+
from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
68

79
from trackio import deploy, utils
810

@@ -12,6 +14,7 @@ def test_space_id():
1214
space_id = os.environ.get("TEST_SPACE_ID")
1315
if not space_id:
1416
pytest.skip("TEST_SPACE_ID environment variable not set")
17+
space_id, _, _ = utils.preprocess_space_and_dataset_ids(space_id, None)
1518
return space_id
1619

1720

@@ -20,16 +23,64 @@ def _ensure_space_ready(test_space_id):
2023
space_id, dataset_id, bucket_id = utils.preprocess_space_and_dataset_ids(
2124
test_space_id, None
2225
)
26+
27+
_reset_test_space(space_id)
2328
deploy.create_space_if_not_exists(space_id, None, dataset_id, bucket_id, None)
29+
_wait_for_space_ready(space_id)
30+
31+
32+
def _reset_test_space(space_id):
33+
try:
34+
huggingface_hub.delete_repo(space_id, repo_type="space")
35+
except RepositoryNotFoundError:
36+
return
37+
except HfHubHTTPError as e:
38+
if e.response is not None and e.response.status_code in (401, 403):
39+
raise RuntimeError(
40+
f"Cannot delete test Space {space_id}: check Hugging Face auth and write access."
41+
) from e
42+
raise
43+
44+
45+
def _wait_for_space_ready(space_id, timeout=300):
46+
hf_api = huggingface_hub.HfApi()
47+
deadline = time.time() + timeout
48+
delay = 2
49+
last_error = None
50+
last_stage = None
51+
52+
terminal_stages = {"NO_APP_FILE", "CONFIG_ERROR", "BUILD_ERROR", "RUNTIME_ERROR"}
2453

25-
deadline = time.time() + 300
2654
while time.time() < deadline:
2755
try:
28-
Client(test_space_id, verbose=False)
29-
return
30-
except Exception:
31-
time.sleep(10)
32-
pytest.fail(f"Space {test_space_id} not ready after 5 minutes")
56+
info = hf_api.space_info(space_id, timeout=30)
57+
runtime = getattr(info, "runtime", None)
58+
if runtime is not None:
59+
last_stage = str(runtime.stage)
60+
if last_stage in terminal_stages:
61+
raise RuntimeError(
62+
f"Space {space_id} entered terminal stage {last_stage}. "
63+
"Check the Hugging Face build logs."
64+
)
65+
if last_stage == "RUNNING":
66+
Client(space_id, verbose=False, httpx_kwargs={"timeout": 30})
67+
return
68+
except HfHubHTTPError as e:
69+
last_error = e
70+
if e.response is not None and e.response.status_code in (401, 403):
71+
raise RuntimeError(
72+
f"Cannot access test Space {space_id}: check Hugging Face auth and permissions."
73+
) from e
74+
except Exception as e:
75+
last_error = e
76+
77+
time.sleep(delay)
78+
delay = min(delay * 1.5, 15)
79+
80+
pytest.fail(
81+
f"Space {space_id} not ready after {timeout}s. "
82+
f"Last observed stage={last_stage!r}, last error={last_error!r}"
83+
)
3384

3485

3586
@pytest.fixture

tests/e2e-spaces/test_data_robustness.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,10 @@ def wrapped_predict(*args, **kwargs):
164164
f"Expected all 3 logs on Space after recovery, got {summary['num_logs']}"
165165
)
166166

167-
local_logs = SQLiteStorage.get_logs(project=project_name, run=run_name)
168-
assert len(local_logs) == 0, (
169-
f"Expected local buffer to be empty after flush, but found {len(local_logs)} rows"
167+
deadline = time.time() + 60
168+
while time.time() < deadline and SQLiteStorage.has_pending_data(project_name):
169+
time.sleep(2)
170+
171+
assert not SQLiteStorage.has_pending_data(project_name), (
172+
"Expected pending local buffer rows to be cleared after flush"
170173
)

tests/e2e-spaces/test_metrics_on_spaces.py

Lines changed: 45 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,46 @@
99
from trackio import utils
1010

1111

12+
def _predict_run_summary(
13+
test_space_id: str,
14+
project_name: str,
15+
run_name: str,
16+
*,
17+
min_num_logs: int = 0,
18+
timeout: float = 240,
19+
):
20+
deadline = time.time() + timeout
21+
last_err: Exception | None = None
22+
flush_attempted = False
23+
while time.time() < deadline:
24+
try:
25+
client = Client(test_space_id, verbose=False)
26+
summary = client.predict(
27+
project=project_name, run=run_name, api_name="/get_run_summary"
28+
)
29+
if summary["num_logs"] >= min_num_logs:
30+
return summary
31+
last_err = None
32+
except Exception as e:
33+
last_err = e
34+
if not flush_attempted and time.time() > deadline - max(timeout - 60, 0):
35+
flush_run = trackio.init(
36+
project=project_name,
37+
name=f"flush_{secrets.token_urlsafe(4)}",
38+
space_id=test_space_id,
39+
auto_log_gpu=False,
40+
)
41+
flush_deadline = time.time() + 30
42+
while flush_run._client is None and time.time() < flush_deadline:
43+
time.sleep(0.1)
44+
flush_run.finish()
45+
flush_attempted = True
46+
time.sleep(5)
47+
if last_err is not None:
48+
raise last_err
49+
raise TimeoutError("get_run_summary timed out before logs appeared")
50+
51+
1252
def test_basic_logging(test_space_id):
1353
project_name = f"test_project_{secrets.token_urlsafe(8)}"
1454
run_name = "test_run"
@@ -18,15 +58,14 @@ def test_basic_logging(test_space_id):
1858
trackio.log(metrics={"loss": 0.2, "acc": 0.9})
1959
trackio.finish()
2060

21-
client = Client(test_space_id)
22-
23-
summary = client.predict(
24-
project=project_name, run=run_name, api_name="/get_run_summary"
61+
summary = _predict_run_summary(
62+
test_space_id, project_name, run_name, min_num_logs=2
2563
)
2664
assert summary["num_logs"] == 2
2765
assert "loss" in summary["metrics"]
2866
assert "acc" in summary["metrics"]
2967

68+
client = Client(test_space_id)
3069
loss_values = client.predict(
3170
project=project_name,
3271
run=run_name,
@@ -159,8 +198,8 @@ def test_bucket_space_preserves_logged_metrics_after_restart(test_space_id):
159198
time.sleep(10)
160199
assert client is not None, "Space did not come back up after restart"
161200

162-
summary = client.predict(
163-
project=project_name, run=run_name, api_name="/get_run_summary"
201+
summary = _predict_run_summary(
202+
test_space_id, project_name, run_name, min_num_logs=1, timeout=360
164203
)
165204
assert summary["num_logs"] == 1
166205
assert "loss" in summary["metrics"] and "acc" in summary["metrics"]

tests/e2e-spaces/test_sync_and_freeze.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ def _download_parquet_from_bucket(bucket_id, remote_name="metrics.parquet"):
2626
with tempfile.TemporaryDirectory() as tmp:
2727
local_path = Path(tmp) / remote_name
2828
huggingface_hub.download_bucket_files(
29-
bucket_id, files=[(remote_name, str(local_path))]
29+
bucket_id,
30+
files=[(remote_name, str(local_path))],
31+
token=huggingface_hub.utils.get_token(),
3032
)
3133
return pd.read_parquet(local_path)
3234

@@ -40,11 +42,20 @@ def _cleanup_space(space_id):
4042

4143
def _cleanup_bucket(bucket_id):
4244
try:
43-
huggingface_hub.delete_bucket(bucket_id)
45+
huggingface_hub.delete_bucket(
46+
bucket_id, token=huggingface_hub.utils.get_token()
47+
)
4448
except Exception:
4549
pass
4650

4751

52+
def _namespace_scoped_repo_id(test_space_id: str, repo_name: str) -> str:
53+
if "/" in test_space_id:
54+
namespace = test_space_id.split("/", 1)[0]
55+
return f"{namespace}/{repo_name}"
56+
return repo_name
57+
58+
4859
def test_sync_to_gradio_space(test_space_id, temp_dir):
4960
project_name = f"test_sync_gradio_{secrets.token_urlsafe(8)}"
5061
run_name = "run1"
@@ -76,11 +87,11 @@ def test_sync_to_gradio_space(test_space_id, temp_dir):
7687
assert loss_values[2]["value"] == 0.1
7788

7889

79-
def test_sync_to_static_space_incremental(temp_dir):
90+
def test_sync_to_static_space_incremental(test_space_id, temp_dir):
8091
project_name = f"test_sync_static_{secrets.token_urlsafe(8)}"
8192
run_name = "run1"
8293
suffix = secrets.token_urlsafe(6)
83-
space_id = f"trackio-test-static-{suffix}"
94+
space_id = _namespace_scoped_repo_id(test_space_id, f"trackio-test-static-{suffix}")
8495
space_id, _, bucket_id = utils.preprocess_space_and_dataset_ids(space_id, None)
8596

8697
try:
@@ -127,7 +138,9 @@ def test_sync_gradio_then_freeze_to_static(test_space_id, temp_dir):
127138
time.sleep(5)
128139

129140
suffix = secrets.token_urlsafe(6)
130-
frozen_space_id = f"trackio-test-frozen-{suffix}"
141+
frozen_space_id = _namespace_scoped_repo_id(
142+
test_space_id, f"trackio-test-frozen-{suffix}"
143+
)
131144
frozen_space_id, _, frozen_bucket_id = utils.preprocess_space_and_dataset_ids(
132145
frozen_space_id, None
133146
)

trackio/bucket_storage.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def _download_db_from_bucket(
6464
huggingface_hub.download_bucket_files(
6565
bucket_id,
6666
files=[(remote_path, str(local_path))],
67+
token=huggingface_hub.utils.get_token(),
6768
)
6869
return local_path.exists()
6970
except Exception:

trackio/deploy.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import tempfile
88
import threading
99
import time
10+
from collections import Counter
1011
from importlib.resources import files
1112
from pathlib import Path
1213

@@ -498,11 +499,13 @@ def sync_incremental(
498499

499500
client = Client(space_id, verbose=False, httpx_kwargs={"timeout": 90})
500501
hf_token = huggingface_hub.utils.get_token()
502+
expected_run_counts: Counter[str] = Counter()
501503

502504
if pending_only:
503505
pending_logs = SQLiteStorage.get_pending_logs(project)
504506
if pending_logs:
505507
logs = pending_logs["logs"]
508+
expected_run_counts.update(log["run"] for log in logs)
506509
for i in range(0, len(logs), SYNC_BATCH_SIZE):
507510
batch = logs[i : i + SYNC_BATCH_SIZE]
508511
print(
@@ -550,6 +553,7 @@ def sync_incremental(
550553
else:
551554
all_logs = SQLiteStorage.get_all_logs_for_sync(project)
552555
if all_logs:
556+
expected_run_counts.update(log["run"] for log in all_logs)
553557
for i in range(0, len(all_logs), SYNC_BATCH_SIZE):
554558
batch = all_logs[i : i + SYNC_BATCH_SIZE]
555559
print(
@@ -568,12 +572,52 @@ def sync_incremental(
568572
api_name="/bulk_log_system", logs=batch, hf_token=hf_token
569573
)
570574

575+
_wait_for_remote_sync(client, project, expected_run_counts)
571576
SQLiteStorage.set_project_metadata(project, "space_id", space_id)
572577
print(
573578
f"* Synced successfully to space: {_BOLD_ORANGE}{SPACE_URL.format(space_id=space_id)}{_RESET}"
574579
)
575580

576581

582+
def _wait_for_remote_sync(
583+
client: Client,
584+
project: str,
585+
expected_run_counts: Counter[str],
586+
timeout: int = 180,
587+
) -> None:
588+
if not expected_run_counts:
589+
return
590+
591+
deadline = time.time() + timeout
592+
delay = 2
593+
last_error: Exception | None = None
594+
pending = dict(expected_run_counts)
595+
596+
while time.time() < deadline and pending:
597+
completed = []
598+
for run_name, expected_num_logs in pending.items():
599+
try:
600+
summary = client.predict(
601+
project=project, run=run_name, api_name="/get_run_summary"
602+
)
603+
if summary.get("num_logs") == expected_num_logs:
604+
completed.append(run_name)
605+
except Exception as e:
606+
last_error = e
607+
for run_name in completed:
608+
pending.pop(run_name, None)
609+
if pending:
610+
time.sleep(delay)
611+
delay = min(delay * 1.5, 15)
612+
613+
if pending:
614+
raise TimeoutError(
615+
f"Remote sync for project '{project}' did not become visible for runs "
616+
f"{sorted(pending.items())} within {timeout}s. "
617+
f"Last error: {last_error!r}"
618+
)
619+
620+
577621
def upload_dataset_for_static(
578622
project: str,
579623
dataset_id: str,
@@ -831,6 +875,14 @@ def _do_sync():
831875
create_space_if_not_exists(
832876
space_id, bucket_id=bucket_id, private=private
833877
)
878+
_wait_for_remote_sync(
879+
Client(space_id, verbose=False, httpx_kwargs={"timeout": 90}),
880+
project,
881+
Counter(
882+
log["run"]
883+
for log in SQLiteStorage.get_all_logs_for_sync(project)
884+
),
885+
)
834886
else:
835887
sync_incremental(project, space_id, private=private, pending_only=False)
836888
SQLiteStorage.set_project_metadata(project, "space_id", space_id)

0 commit comments

Comments (0)