Skip to content

Commit 9beb2a4

Browse files
authored
fix: fix the chunking along Z (can handle prime numbers now) (#376)
* fix: fix the chunking along Z (can handle prime numbers now) Signed-off-by: Sricharan Reddy Varra <sricharan.varra@biohub.org> * fix: simplified chunk sizes for conversion Signed-off-by: Sricharan Reddy Varra <sricharan.varra@biohub.org> --------- Signed-off-by: Sricharan Reddy Varra <sricharan.varra@biohub.org>
1 parent 481451e commit 9beb2a4

4 files changed

Lines changed: 74 additions & 46 deletions

File tree

src/iohub/convert.py

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from pathlib import Path
55
from typing import Literal
66

7-
import dask
87
import numpy as np
98
from tqdm import tqdm
109
from tqdm.contrib.itertools import product
@@ -13,7 +12,7 @@
1312
from iohub.ngff.models import TransformationMeta
1413
from iohub.ngff.nodes import Position, open_ome_zarr
1514
from iohub.ngff.utils import (
16-
_adjust_chunks_for_divisibility,
15+
_clamp_chunks_to_shape,
1716
_limit_zyx_chunk_size,
1817
)
1918
from iohub.reader import MMStack, NDTiffDataset, read_images
@@ -265,11 +264,11 @@ def _gen_chunks(self, input_chunks):
265264
chunk_zyx_shape = _limit_zyx_chunk_size(shape, bytes_per_pixel, MAX_CHUNK_SIZE, chunks=chunks)
266265
chunks[-3:] = list(chunk_zyx_shape)
267266

268-
# Adjust chunks to divide evenly into dimensions
269-
chunks = _adjust_chunks_for_divisibility(shape, chunks)
267+
# Clamp chunks so they don't exceed dimension sizes
268+
chunks = _clamp_chunks_to_shape(shape, chunks)
270269
for i, (orig, adj, dim) in enumerate(zip(original_chunks, chunks, shape, strict=False)):
271-
if orig != adj:
272-
_logger.warning(f"Chunk size {orig} on axis {i} adjusted to {adj} (dimension {dim}).")
270+
if adj < orig:
271+
_logger.warning(f"Chunk size {orig} on axis {i} clamped to {adj} (dimension size {dim}).")
273272

274273
_logger.debug(f"Zarr store chunk size will be set to {chunks}.")
275274

@@ -384,13 +383,9 @@ def __call__(self) -> None:
384383
_logger.debug("Setting up Zarr store.")
385384

386385
self._init_zarr_arrays()
387-
# Calculate chunk size in bytes for dask config
388-
# This prevents data loss when rechunking to zarr chunks
389-
# See: https://github.com/czbiohub-sf/iohub/issues/367
390-
chunk_size_bytes = int(np.prod(self.chunks) * np.dtype(self.reader.dtype).itemsize)
391386

392387
_logger.debug("Converting images.")
393-
with logging_redirect_tqdm(), dask.config.set({"array.chunk-size": chunk_size_bytes}):
388+
with logging_redirect_tqdm():
394389
for zarr_pos_name, (_, fov) in tqdm(
395390
zip(self.zarr_position_names, self.reader, strict=True),
396391
total=len(self.zarr_position_names),

src/iohub/core/implementations/zarr_python.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,9 +139,10 @@ def to_dask(self, handle: zarr.Array) -> Any:
139139
return da.from_zarr(handle)
140140

141141
def write_from_dask(self, handle: zarr.Array, dask_array: Any) -> None:
142-
from dask.array import to_zarr
142+
from iohub.core.downsample import iter_work_regions
143143

144-
to_zarr(dask_array, handle)
144+
for region in iter_work_regions(handle.shape, handle.chunks):
145+
handle[region] = dask_array[region].compute()
145146

146147
# -- High-performance operations ---------------------------------------
147148

src/iohub/ngff/utils.py

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -447,8 +447,8 @@ def _limit_zyx_chunk_size(
447447
return chunk_zyx_shape
448448

449449

450-
def _adjust_chunks_for_divisibility(shape: tuple[int, ...], chunks: tuple[int, ...]) -> tuple[int, ...]:
451-
"""Adjust chunks to divide evenly into dimensions for Dask.
450+
def _clamp_chunks_to_shape(shape: tuple[int, ...], chunks: tuple[int, ...]) -> tuple[int, ...]:
451+
"""Clamp chunk sizes so they don't exceed dimension sizes.
452452
453453
Parameters
454454
----------
@@ -460,18 +460,6 @@ def _adjust_chunks_for_divisibility(shape: tuple[int, ...], chunks: tuple[int, .
460460
Returns
461461
-------
462462
tuple[int, ...]
463-
Adjusted chunk sizes that divide evenly into dimensions.
463+
Chunk sizes clamped to at most the dimension size.
464464
"""
465-
shape = list(shape)
466-
chunks = list(chunks)
467-
adjusted = []
468-
for chunk, dim in zip(chunks, shape, strict=False):
469-
if chunk > dim:
470-
adjusted.append(dim)
471-
elif dim % chunk != 0:
472-
while chunk > 1 and dim % chunk != 0:
473-
chunk -= 1
474-
adjusted.append(chunk)
475-
else:
476-
adjusted.append(chunk)
477-
return tuple(adjusted)
465+
return tuple(min(c, d) for c, d in zip(chunks, shape, strict=False))

tests/test_converter.py

Lines changed: 61 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from ndtiff import Dataset
99
from tifffile import TiffFile
1010

11-
from iohub.convert import TIFFConverter, _adjust_chunks_for_divisibility
11+
from iohub.convert import TIFFConverter, _clamp_chunks_to_shape
1212
from iohub.ngff import Position, open_ome_zarr
1313
from iohub.reader import MMStack, NDTiffDataset
1414
from tests.conftest import (
@@ -54,7 +54,7 @@ def _check_chunks(position: Position, chunks: Literal["XY", "XYZ"] | tuple[int]
5454
case "XYZ" | None:
5555
assert img.chunks == (1,) * 2 + img.shape[-3:]
5656
case tuple():
57-
expected = _adjust_chunks_for_divisibility(img.shape, chunks)
57+
expected = _clamp_chunks_to_shape(img.shape, chunks)
5858
assert img.chunks == expected
5959
case _:
6060
raise AssertionError()
@@ -264,34 +264,41 @@ class MockReader:
264264
@pytest.mark.parametrize(
265265
("z_dim", "input_chunk", "expected_z"),
266266
[
267-
(15, 10, 5), # 10 doesn't divide 15, adjust to 5
268-
(7, 5, 1), # prime - falls to 1
269-
(16, 8, 8), # divides evenly, no change
270-
(100, 50, 50), # divides evenly
271-
(9, 4, 3), # odd non-prime, adjust to 3
267+
(15, 10, 10), # 10 < 15, no clamping needed
268+
(7, 5, 5), # 5 < 7, no clamping needed
269+
(16, 8, 8), # 8 < 16, no clamping needed
270+
(100, 50, 50), # 50 < 100, no clamping needed
271+
(9, 4, 4), # 4 < 9, no clamping needed
272+
(5, 10, 5), # 10 > 5, clamped to 5
272273
],
273274
)
274-
def test_gen_chunks_divisibility(self, make_mock_converter, z_dim, input_chunk, expected_z):
275-
"""Test chunks are adjusted to divide evenly into dimensions."""
275+
def test_gen_chunks_clamping(self, make_mock_converter, z_dim, input_chunk, expected_z):
276+
"""Test chunks are clamped to dimension size but not reduced for divisibility."""
276277
converter = make_mock_converter(z=z_dim)
277278
chunks = converter._gen_chunks((1, 1, input_chunk, 256, 256))
278279
assert chunks[2] == expected_z
279-
assert z_dim % chunks[2] == 0
280+
assert chunks[2] <= z_dim
280281

281-
def test_gen_chunks_max_chunk_size_then_divisibility(self, make_mock_converter):
282-
"""Test divisibility check happens AFTER MAX_CHUNK_SIZE adjustment."""
283-
# Large chunks that trigger MAX_CHUNK_SIZE, then need divisibility fix
284-
# Z=15, large XY to trigger size limit, chunk should end up dividing 15
282+
def test_gen_chunks_max_chunk_size_then_clamp(self, make_mock_converter):
283+
"""Test clamping happens AFTER MAX_CHUNK_SIZE adjustment."""
285284
converter = make_mock_converter(z=15, y=2048, x=2048)
286285
chunks = converter._gen_chunks((1, 1, 15, 2048, 2048))
287-
assert 15 % chunks[2] == 0
286+
assert chunks[2] <= 15
288287

289288
@pytest.mark.parametrize("z_dim", [7, 11, 13, 17, 19, 23]) # prime numbers
290289
def test_gen_chunks_prime_dimensions(self, make_mock_converter, z_dim):
291-
"""Test handling of prime dimension sizes - should fall to 1."""
290+
"""Test prime dimensions do NOT reduce chunk to 1."""
292291
converter = make_mock_converter(z=z_dim)
293292
chunks = converter._gen_chunks((1, 1, z_dim - 1, 256, 256))
294-
assert z_dim % chunks[2] == 0
293+
assert chunks[2] == z_dim - 1 # should stay as-is (< dim)
294+
assert chunks[2] > 1
295+
296+
def test_gen_chunks_prime_z_1187(self, make_mock_converter):
297+
"""Regression: Z=1187 (prime) should not reduce chunk to 1."""
298+
converter = make_mock_converter(z=1187, y=2048, x=2048)
299+
chunks = converter._gen_chunks("XYZ")
300+
assert chunks[2] > 1
301+
assert chunks[2] <= 1187
295302

296303
@pytest.mark.parametrize("input_chunks", ["XY", "XYZ", None])
297304
def test_gen_chunks_string_inputs(self, make_mock_converter, input_chunks):
@@ -342,6 +349,43 @@ def test_rechunk_xy_to_xyz_preserves_data(shape, target_chunks, tmpdir):
342349
np.testing.assert_array_equal(data, written, err_msg="Data lost during XY to XYZ rechunking")
343350

344351

352+
@pytest.mark.parametrize(
353+
("shape", "target_chunks"),
354+
[
355+
# Non-divisible Z: 1187 is prime, chunk 512 doesn't divide evenly
356+
((1, 1, 1187, 256, 256), (1, 1, 512, 256, 256)),
357+
# Another non-divisible case
358+
((1, 1, 50, 512, 512), (1, 1, 32, 512, 512)),
359+
],
360+
)
361+
def test_rechunk_non_divisible_z_preserves_data(shape, target_chunks, tmpdir):
362+
"""Test that rechunking with non-divisible Z chunks preserves all data.
363+
364+
Regression test for https://github.com/czbiohub-sf/iohub/issues/374
365+
"""
366+
import dask
367+
import dask.array as da
368+
369+
data = np.arange(np.prod(shape), dtype=np.uint16).reshape(shape)
370+
371+
source_chunks = (1, 1, 1, shape[3], shape[4])
372+
dask_data = da.from_array(data, chunks=source_chunks)
373+
374+
output = tmpdir / "test_rechunk_nondiv.zarr"
375+
with open_ome_zarr(output, layout="hcs", mode="w-", channel_names=["test"], version="0.5") as writer:
376+
pos = writer.create_position("0", "0", "0")
377+
zarr_img = pos.create_zeros(name="0", shape=shape, dtype=np.uint16, chunks=target_chunks)
378+
379+
chunk_size_bytes = int(np.prod(target_chunks) * 2)
380+
with dask.config.set({"array.chunk-size": chunk_size_bytes}):
381+
zarr_img.write_from_dask(dask_data.rechunk(target_chunks))
382+
383+
with open_ome_zarr(output, layout="hcs", mode="r") as reader:
384+
written = reader["0/0/0"]["0"][:]
385+
386+
np.testing.assert_array_equal(data, written, err_msg="Data lost during non-divisible Z rechunking")
387+
388+
345389
class TestVersionParameter:
346390
"""Tests for the OME-NGFF version parameter on TIFFConverter."""
347391

0 commit comments

Comments (0)