Skip to content

Commit 610d8a4

Browse files
authored
perf: Lazy metadata access (#381)
fix: Replace eager metadata parsing with lazy cached_property resolution and eliminate pandas from validation hot path Signed-off-by: Sricharan Reddy Varra <sricharan.varra@biohub.org>
1 parent 3c47e29 commit 610d8a4

2 files changed

Lines changed: 58 additions & 46 deletions

File tree

src/iohub/ngff/models.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import re
1313
from typing import Annotated, Any, Literal, Optional
1414

15-
import pandas as pd
1615
from pydantic import (
1716
AfterValidator,
1817
BaseModel,
@@ -49,11 +48,9 @@ def unique_validator(data: list[BaseModel], field: str | list[str]) -> list[Base
4948
raised if any value is not unique
5049
"""
5150
fields = [field] if isinstance(field, str) else field
52-
if not isinstance(data[0], dict):
53-
params = [d.model_dump() for d in data]
54-
df = pd.DataFrame(params)
5551
for key in fields:
56-
if not df[key].is_unique:
52+
values = [d[key] if isinstance(d, dict) else getattr(d, key) for d in data]
53+
if len(values) != len(set(values)):
5754
raise ValueError(f"'{key}' must be unique!")
5855
return data
5956

src/iohub/ngff/nodes.py

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import shutil
1212
from copy import deepcopy
1313
from datetime import datetime
14+
from functools import cached_property
1415
from pathlib import Path
1516
from typing import Generator, Literal, Sequence, Type, TypeAlias, overload
1617

@@ -112,23 +113,31 @@ def __init__(
112113
version: Literal["0.4", "0.5"] = "0.4",
113114
overwriting_creation: bool = False,
114115
):
115-
if channel_names:
116-
self._channel_names = channel_names
116+
if channel_names is not None:
117+
self.channel_names = channel_names
117118
elif not parse_meta:
118119
raise ValueError("Channel names need to be provided or in metadata.")
119-
if axes:
120+
if axes is not None:
120121
self.axes = axes
121122
self._group = group
122123
self._overwrite = overwriting_creation
123124
self._version: Literal["0.4", "0.5"] = version
124125
if parse_meta:
125126
self._parse_meta()
126-
if not hasattr(self, "axes"):
127-
self.axes = self._DEFAULT_AXES
128127
# TODO: properly check the underlying storage type
129128
# This works for now as only the local filesystem is supported
130129
self._case_insensitive_fs = _case_insensitive_local_fs()
131130

131+
@cached_property
def axes(self):
    """Return the axes metadata of this node.

    Falls back to the class-level ``_DEFAULT_AXES`` when no value has
    been assigned on the instance. Because this is a ``cached_property``,
    assigning ``self.axes = ...`` stores the value on the instance and
    bypasses this fallback.
    """
    default_axes = self._DEFAULT_AXES
    return default_axes
135+
136+
@cached_property
def channel_names(self):
    """Return the channel names of this node.

    The base implementation has nothing to resolve and always raises;
    a value assigned on the instance (``self.channel_names = ...``)
    takes precedence over this property, and subclasses may override
    it to resolve the names lazily.

    Raises
    ------
    AttributeError
        Always, when no channel names were assigned on the instance.
    """
    message = "Channel names not available. Provide channel_names or ensure metadata is parseable."
    raise AttributeError(message)
140+
132141
@property
133142
def zgroup(self):
134143
"""Corresponding Zarr group of the node."""
@@ -150,10 +159,6 @@ def version(self) -> Literal["0.4", "0.5"]:
150159
"""NGFF version"""
151160
return self._version
152161

153-
@property
154-
def channel_names(self):
155-
return self._channel_names
156-
157162
@property
158163
def _parent_path(self):
159164
"""The parent Zarr group path of the node.
@@ -174,7 +179,7 @@ def _child_attrs(self):
174179
return dict(
175180
version=self._version,
176181
axes=self.axes,
177-
channel_names=self._channel_names,
182+
channel_names=self.channel_names,
178183
overwriting_creation=self._overwrite,
179184
)
180185

@@ -294,13 +299,9 @@ def get_channel_index(self, name: str):
294299
int
295300
Index of the channel.
296301
"""
297-
if not hasattr(self, "_channel_names"):
298-
raise AttributeError(
299-
f"Channel names are not set for this NGFF node. Cannot get the index for channel name '{name}'"
300-
)
301-
if name not in self._channel_names:
302-
raise ValueError(f"Channel {name} is not in the existing channels: {self._channel_names}")
303-
return self._channel_names.index(name)
302+
if name not in self.channel_names:
303+
raise ValueError(f"Channel {name} is not in the existing channels: {self.channel_names}")
304+
return self.channel_names.index(name)
304305

305306
def _warn_invalid_meta(self):
306307
msg = "Zarr group at {} does not have valid metadata for {}".format(self._group.path, type(self))
@@ -580,11 +581,11 @@ def __init__(
580581
def _set_meta(self):
581582
self.axes = self.metadata.multiscales[0].axes
582583
if self.metadata.omero is not None:
583-
self._channel_names = [c.label for c in self.metadata.omero.channels]
584+
self.channel_names = [c.label for c in self.metadata.omero.channels]
584585
else:
585586
_logger.warning("OMERO metadata not found. Using channel indices as channel names.")
586587
example_image: ImageArray = self[self.metadata.multiscales[0].datasets[0].path]
587-
self._channel_names = list(range(example_image.channels))
588+
self.channel_names = list(range(example_image.channels))
588589

589590
def _parse_meta(self):
590591
try:
@@ -900,9 +901,9 @@ def append_channel(self, chan_name: str, resize_arrays: bool = True):
900901
Whether to resize all the image arrays for the new channel,
901902
by default True
902903
"""
903-
if chan_name in self._channel_names:
904+
if chan_name in self.channel_names:
904905
raise ValueError(f"Channel name '{chan_name}' already exists.")
905-
self._channel_names.append(chan_name)
906+
self.channel_names.append(chan_name)
906907
if resize_arrays:
907908
for _, img in self.images():
908909
ch_ax = self._get_channel_axis()
@@ -930,7 +931,7 @@ def rename_channel(self, old: str, new: str):
930931
New name of the channel
931932
"""
932933
ch_idx = self.get_channel_index(old)
933-
self._channel_names[ch_idx] = new
934+
self.channel_names[ch_idx] = new
934935
if hasattr(self.metadata, "omero"):
935936
self.metadata.omero.channels[ch_idx].label = new
936937
self.dump_meta()
@@ -1820,26 +1821,40 @@ def _parse_meta(self):
18201821
self.metadata = PlateMeta(**plate_meta)
18211822
else:
18221823
self._warn_invalid_meta()
1823-
for attr in ("_channel_names", "axes"):
1824-
if not hasattr(self, attr):
1825-
self._first_pos_attr(attr)
1826-
1827-
def _first_pos_attr(self, attr: str):
1828-
"""Get attribute value from the first position."""
1829-
name = " ".join(attr.split("_")).strip()
1830-
msg = f"Cannot determine {name}:"
1831-
try:
1832-
row_grp = next(self.zgroup.groups())[1]
1833-
well_grp = next(row_grp.groups())[1]
1834-
pos_grp = next(well_grp.groups())[1]
1835-
except StopIteration:
1836-
_logger.warning(f"{msg} No position is found in the dataset.")
1837-
return
1824+
1825+
@cached_property
def _first_pos(self):
    """Open the first position of the plate by direct path lookup.

    The path of the first well is taken from the already-parsed plate
    metadata (``self.metadata.wells[0].path``); the first image path is
    then read from that well's attributes (under the ``"ome"`` key when
    present, otherwise from the top-level attributes). This avoids
    enumerating groups with ``Group.groups()``, which is eager in
    zarr v3.

    Returns
    -------
    Position or None
        The first position, or ``None`` (after logging a warning) when
        the lookup fails with ``IndexError``, ``KeyError`` or
        ``AttributeError``.
    """
    try:
        first_well = self.zgroup[self.metadata.wells[0].path]
        # Prefer the "ome" namespace; fall back to the flat attribute layout.
        well_attrs = first_well.attrs.get("ome") or dict(first_well.attrs)
        first_image_path = well_attrs["well"]["images"][0]["path"]
        position = Position(
            group=first_well[first_image_path],
            parse_meta=True,
            version=self._version,
        )
    except (IndexError, KeyError, AttributeError):
        _logger.warning("Cannot read first position metadata.")
        position = None
    return position
1846+
1847+
@cached_property
def channel_names(self):
    """Return the channel names read from the first position.

    The value is taken from ``self._first_pos.channel_names`` and cached
    on the instance.

    Raises
    ------
    AttributeError
        If ``self._first_pos`` is ``None``.
    """
    pos = self._first_pos
    # Compare against None explicitly instead of relying on truthiness:
    # a position object that evaluates falsy (e.g. a container-like node
    # with no children) still carries usable channel names.
    if pos is not None:
        return pos.channel_names
    raise AttributeError("No position found to read channel names from.")
1852+
1853+
@cached_property
def axes(self):
    """Return the axes read from the first position.

    The value is taken from ``self._first_pos.axes`` and cached on the
    instance; when ``self._first_pos`` is ``None`` the class-level
    ``_DEFAULT_AXES`` is returned instead.
    """
    pos = self._first_pos
    # Compare against None explicitly instead of relying on truthiness:
    # a position object that evaluates falsy (e.g. a container-like node
    # with no children) must not silently fall back to the defaults.
    if pos is not None:
        return pos.axes
    return self._DEFAULT_AXES
18431858

18441859
def dump_meta(self, field_count: bool = False):
18451860
"""Dumps metadata JSON to the `.zattrs` file.

0 commit comments

Comments
 (0)