11"""The main API layer for the Trackio UI."""
22
33import base64
4+ import logging
45import os
56import re
67import secrets
78import shutil
9+ import sqlite3
10+ import threading
811import time
12+ from collections import deque
913from functools import lru_cache
1014from typing import Any
1115from urllib .parse import urlencode
2327
2428HfApi = hf .HfApi ()
2529
30+ logger = logging .getLogger ("trackio" )
31+
32+ _write_queue : deque [tuple [str , Any ]] = deque ()
33+ _flush_thread : threading .Thread | None = None
34+ _flush_lock = threading .Lock ()
35+ _FLUSH_INTERVAL = 2.0
36+ _MAX_RETRIES = 30
37+
38+
39+ def _enqueue_write (kind : str , payload : Any ) -> None :
40+ _write_queue .append ((kind , payload ))
41+ _ensure_flush_thread ()
42+
43+
44+ def _ensure_flush_thread () -> None :
45+ global _flush_thread
46+ with _flush_lock :
47+ if _flush_thread is not None and _flush_thread .is_alive ():
48+ return
49+ _flush_thread = threading .Thread (target = _flush_loop , daemon = True )
50+ _flush_thread .start ()
51+
52+
53+ def _flush_loop () -> None :
54+ retries = 0
55+ while _write_queue and retries < _MAX_RETRIES :
56+ kind , payload = _write_queue [0 ]
57+ try :
58+ if kind == "bulk_log" :
59+ SQLiteStorage .bulk_log (** payload )
60+ elif kind == "bulk_log_system" :
61+ SQLiteStorage .bulk_log_system (** payload )
62+ elif kind == "bulk_alert" :
63+ SQLiteStorage .bulk_alert (** payload )
64+ _write_queue .popleft ()
65+ retries = 0
66+ except sqlite3 .OperationalError as e :
67+ msg = str (e ).lower ()
68+ if "disk i/o error" in msg or "readonly" in msg :
69+ retries += 1
70+ logger .warning (
71+ "write queue: flush failed (%s), retry %d/%d" ,
72+ e ,
73+ retries ,
74+ _MAX_RETRIES ,
75+ )
76+ time .sleep (min (_FLUSH_INTERVAL * retries , 15.0 ))
77+ else :
78+ logger .error ("write queue: non-retryable error (%s), dropping entry" , e )
79+ _write_queue .popleft ()
80+ retries = 0
81+ if _write_queue :
82+ logger .error (
83+ "write queue: giving up after %d retries, %d entries dropped" ,
84+ _MAX_RETRIES ,
85+ len (_write_queue ),
86+ )
87+ _write_queue .clear ()
88+
89+
2690write_token = secrets .token_urlsafe (32 )
2791
2892OAUTH_CALLBACK_PATH = "/login/callback"
@@ -345,14 +409,18 @@ def bulk_log(
345409
346410 for (project , run ), data in logs_by_run .items ():
347411 has_log_ids = any (lid is not None for lid in data ["log_ids" ])
348- SQLiteStorage . bulk_log (
412+ payload = dict (
349413 project = project ,
350414 run = run ,
351415 metrics_list = data ["metrics" ],
352416 steps = data ["steps" ],
353417 config = data ["config" ],
354418 log_ids = data ["log_ids" ] if has_log_ids else None ,
355419 )
420+ try :
421+ SQLiteStorage .bulk_log (** payload )
422+ except sqlite3 .OperationalError :
423+ _enqueue_write ("bulk_log" , payload )
356424
357425
358426def bulk_log_system (
@@ -372,13 +440,17 @@ def bulk_log_system(
372440
373441 for (project , run ), data in logs_by_run .items ():
374442 has_log_ids = any (lid is not None for lid in data ["log_ids" ])
375- SQLiteStorage . bulk_log_system (
443+ payload = dict (
376444 project = project ,
377445 run = run ,
378446 metrics_list = data ["metrics" ],
379447 timestamps = data ["timestamps" ],
380448 log_ids = data ["log_ids" ] if has_log_ids else None ,
381449 )
450+ try :
451+ SQLiteStorage .bulk_log_system (** payload )
452+ except sqlite3 .OperationalError :
453+ _enqueue_write ("bulk_log_system" , payload )
382454
383455
384456def bulk_alert (
@@ -408,7 +480,7 @@ def bulk_alert(
408480
409481 for (project , run ), data in alerts_by_run .items ():
410482 has_alert_ids = any (aid is not None for aid in data ["alert_ids" ])
411- SQLiteStorage . bulk_alert (
483+ payload = dict (
412484 project = project ,
413485 run = run ,
414486 titles = data ["titles" ],
@@ -418,6 +490,10 @@ def bulk_alert(
418490 timestamps = data ["timestamps" ],
419491 alert_ids = data ["alert_ids" ] if has_alert_ids else None ,
420492 )
493+ try :
494+ SQLiteStorage .bulk_alert (** payload )
495+ except sqlite3 .OperationalError :
496+ _enqueue_write ("bulk_alert" , payload )
421497
422498
423499def get_alerts (
0 commit comments