Skip to content

Commit e862708

Browse files
authored
Flatten metrics column in parquet representation (#106)
* When exporting to Parquet, flatten the `metrics` column into multiple Parquet columns.
* When importing, do the reverse (if the Parquet file doesn't already have a `metrics` column).
* If the Parquet file has a `metrics` column (legacy format), the unflattening is skipped on import, so legacy files should continue to work.
* Add "metrics" to the reserved metric names, so that it is not possible to log a metric whose name would confuse this logic.

Fixes #105
1 parent 7872869 commit e862708

2 files changed

Lines changed: 20 additions & 1 deletion

File tree

trackio/sqlite_storage.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ def export_to_parquet():
9393
):
9494
with sqlite3.connect(db_path) as conn:
9595
df = pd.read_sql("SELECT * from metrics", conn)
96+
# break out the single JSON metrics column into individual columns
97+
metrics = df["metrics"].copy()
98+
metrics = pd.DataFrame(
99+
metrics.apply(json.loads).values.tolist(), index=df.index
100+
)
101+
del df["metrics"]
102+
for col in metrics.columns:
103+
df[col] = metrics[col]
96104
df.to_parquet(parquet_path)
97105

98106
@staticmethod
@@ -107,6 +115,17 @@ def import_from_parquet():
107115
db_path = parquet_path.with_suffix(".db")
108116
df = pd.read_parquet(parquet_path)
109117
with sqlite3.connect(db_path) as conn:
118+
# fix up df to have a single JSON metrics column
119+
if "metrics" not in df.columns:
120+
# separate other columns from metrics
121+
metrics = df.copy()
122+
other_cols = ["id", "timestamp", "run_name", "step"]
123+
df = df[other_cols]
124+
for col in other_cols:
125+
del metrics[col]
126+
# combine them all into a single metrics col
127+
metrics = json.loads(metrics.to_json(orient="records"))
128+
df["metrics"] = [json.dumps(row) for row in metrics]
110129
df.to_sql("metrics", conn, if_exists="replace", index=False)
111130

112131
@staticmethod

trackio/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import pandas as pd
99
from huggingface_hub.constants import HF_HOME
1010

11-
RESERVED_KEYS = ["project", "run", "timestamp", "step", "time"]
11+
RESERVED_KEYS = ["project", "run", "timestamp", "step", "time", "metrics"]
1212
TRACKIO_DIR = Path(HF_HOME) / "trackio"
1313

1414
TRACKIO_LOGO_DIR = Path(__file__).parent / "assets"

0 commit comments

Comments
 (0)