feat(tensorstore): expose recheck_cached_data on TensorStoreConfig (#406)

edyoshikun · claude · web-flow · commit efff54435d6a · 2026-04-23T17:06:24.000-07:00
* feat(tensorstore): expose recheck_cached_data on TensorStoreConfig

Add ``recheck_cached_data`` to ``TensorStoreConfig`` and forward it into
``ts.open`` in ``TensorStoreImplementation.open_array``. The option controls
whether cached chunk data is revalidated on every read (the TensorStore
driver default) or only at open time (``"open"``), which is the recommended
setting for long-running read-heavy workloads on networked filesystems
(NFS/VAST) where revalidation costs one stat/GETATTR per chunk per read.

``None`` (default) preserves existing behaviour by omitting the kwarg so
the TensorStore driver keeps its own default. ``True``, ``False``, and
``"open"`` are forwarded verbatim.

Covered by a parametrized test that monkey-patches ``_ts_open`` to assert
the kwarg reaches TensorStore for each configured value and is absent when
unset.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* delete redundant text

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/iohub/core/config.py b/src/iohub/core/config.py
@@ -34,7 +34,26 @@ class ZarrConfig(BaseModel):
 
 
 class TensorStoreConfig(BaseModel):
-    """Config for the TensorStore implementation."""
+    """Config for the TensorStore implementation.
+
+    Parameters
+    ----------
+    file_io_concurrency : int or None
+        Concurrency limit for TensorStore's ``file_io_concurrency``
+        resource. Raise above the default (32) on high-latency networked
+        filesystems (e.g. NFS) where the default under-saturates the link.
+    cache_pool_bytes : int or None
+        Aggregate byte budget for TensorStore's chunk cache pool. ``None``
+        disables caching.
+    recheck_cached_data : bool, "open" or None
+        Controls whether cached chunk data is re-validated on each read.
+        ``None`` (default) uses the TensorStore driver default, which
+        revalidates cached chunk data on every read — one stat/GETATTR per
+        chunk. ``"open"`` checks freshness only when the array is opened
+        and trusts the cache thereafter — recommended for long-running
+        read-heavy workloads on NFS/VAST where the underlying zarr files
+        do not change. ``False`` disables freshness checks entirely.
+    """
 
     compressor: CompressorConfig = Field(default_factory=CompressorConfig)
     data_copy_concurrency: int = Field(default=4, ge=1)
@@ -43,6 +62,7 @@ class TensorStoreConfig(BaseModel):
     file_io_sync: bool = True
     file_io_locking: Literal["auto", "disabled"] = "auto"
     cache_pool_bytes: int | None = None
+    recheck_cached_data: bool | Literal["open"] | None = None
     extra_context: dict | None = None
 
 
diff --git a/src/iohub/core/implementations/tensorstore.py b/src/iohub/core/implementations/tensorstore.py
@@ -193,13 +193,15 @@ def open_array(self, group: zarr.Group, name: str) -> ts.TensorStore:
                 "driver": driver,
                 "kvstore": {"driver": "file", "path": key},
             }
-            self._array_cache[key] = _ts_open(
-                spec,
-                open=True,
-                read=True,
-                write=writable,
-                context=self._context(),
-            )
+            open_kwargs: dict[str, Any] = {
+                "open": True,
+                "read": True,
+                "write": writable,
+                "context": self._context(),
+            }
+            if self.config.recheck_cached_data is not None:
+                open_kwargs["recheck_cached_data"] = self.config.recheck_cached_data
+            self._array_cache[key] = _ts_open(spec, **open_kwargs)
         return self._array_cache[key]
 
     # -- Array I/O ---------------------------------------------------------