diff --git a/src/iohub/core/ozx.py b/src/iohub/core/ozx.py
index f4791f6c..6b4b1400 100644
--- a/src/iohub/core/ozx.py
+++ b/src/iohub/core/ozx.py
@@ -12,6 +12,7 @@
 from __future__ import annotations
 
 import json
 import logging
 import shutil
+import stat
 import zipfile
@@ -37,6 +38,21 @@
 _CENTRAL_DIRECTORY = "centralDirectory"
 _JSON_FIRST = "jsonFirst"
 
+# Reproducibility: pin every variable field on each ZipInfo so two
+# `pack_ozx` runs over the same source produce byte-identical archives.
+# The defaults Python's ``zipfile`` would otherwise pick — wall-clock
+# mtime, OS-specific create_system, OS-specific external_attr — make
+# downstream sha256 verification meaningless across machines and runs.
+#
+# - 1980-01-01 is the earliest representable zip timestamp.
+# - create_system 3 is the canonical Unix creator; using it on every
+#   platform ensures Windows-side packs match Linux-side packs.
+# - external_attr is a regular-file 0o644 in the upper 16 bits, where
+#   external file attributes live for Unix-created entries.
+_REPRODUCIBLE_DATE_TIME = (1980, 1, 1, 0, 0, 0)
+_REPRODUCIBLE_FILE_ATTR = (stat.S_IFREG | 0o644) << 16
+_CREATE_SYSTEM_UNIX = 3
+
 
 def is_ozx_path(path: str | Path) -> bool:
     """Return True when ``path`` ends with ``.ozx``.
@@ -220,6 +236,22 @@ def _bfs_order(names: Iterable[str]) -> list[str]:
     return [name for _, name in meta] + chunks
 
 
+def _reproducible_zip_info(arcname: str) -> zipfile.ZipInfo:
+    """Build a ``ZipInfo`` with every variable header field pinned.
+
+    Pass to ``zipfile.ZipFile.open(zinfo, mode="w")`` instead of a
+    bare arcname — that path lets Python stamp ``date_time``,
+    ``create_system``, and ``external_attr`` from runtime state,
+    which is what makes consecutive ``pack_ozx`` runs produce
+    different bytes for the same source.
+    """
+    zinfo = zipfile.ZipInfo(filename=arcname, date_time=_REPRODUCIBLE_DATE_TIME)
+    zinfo.compress_type = zipfile.ZIP_STORED
+    zinfo.create_system = _CREATE_SYSTEM_UNIX
+    zinfo.external_attr = _REPRODUCIBLE_FILE_ATTR
+    return zinfo
+
+
 def _write_ozx_archive(
     out_path: Path,
     ordered: list[str],
@@ -230,7 +262,9 @@ def _write_ozx_archive(
     """Write an RFC-9 ``.ozx`` to ``out_path`` from a callable entry source.
 
     Owns the destination ``ZipFile``, the archive comment, and the
-    unlink-on-failure cleanup.
+    unlink-on-failure cleanup. Each entry's header is built from
+    :func:`_reproducible_zip_info` so the resulting bytes are stable
+    across machines and re-runs given identical input.
     """
     try:
         with zipfile.ZipFile(
@@ -240,7 +274,11 @@ def _write_ozx_archive(
             allowZip64=True,
         ) as zout:
             for arcname in ordered:
-                with open_member(arcname) as src_f, zout.open(arcname, mode="w") as dst_f:
+                zinfo = _reproducible_zip_info(arcname)
+                with (
+                    open_member(arcname) as src_f,
+                    zout.open(zinfo, mode="w", force_zip64=True) as dst_f,
+                ):
                     shutil.copyfileobj(src_f, dst_f, length=_COPY_BUFFER_BYTES)
             zout.comment = _build_comment(version, json_first=True)
     except Exception:
diff --git a/tests/ngff/test_ozx.py b/tests/ngff/test_ozx.py
index 4e275104..f4aed86c 100644
--- a/tests/ngff/test_ozx.py
+++ b/tests/ngff/test_ozx.py
@@ -87,6 +87,68 @@ def test_pack_ozx_from_directory(tmp_path: Path) -> None:
         pack_ozx(src_dir, dst)
 
 
+def test_pack_ozx_is_byte_reproducible(tmp_path: Path) -> None:
+    """Two pack_ozx runs over the same source produce byte-identical archives.
+
+    Without pinning ``date_time`` / ``create_system`` / ``external_attr``
+    on each ``ZipInfo``, Python's default ``zipfile`` behaviour stamps
+    wall-clock mtimes into every Local File Header, so consecutive packs
+    diverge at the byte level — and any sha256-based artifact integrity
+    check (Croissant ``cr:FileObject.sha256``, OZX MANIFEST.json) fails.
+    """
+    src_dir = tmp_path / "src.zarr"
+    with open_ome_zarr(
+        src_dir, layout="fov", mode="w", channel_names=["c"], version="0.5"
+    ) as pos:
+        arr = pos.create_zeros(
+            "0", shape=(1, 1, 1, 4, 4), dtype=np.uint8, chunks=(1, 1, 1, 2, 2)
+        )
+        arr[:] = np.arange(16, dtype=np.uint8).reshape(arr.shape)
+
+    a = tmp_path / "a.ozx"
+    b = tmp_path / "b.ozx"
+    pack_ozx(src_dir, a)
+    pack_ozx(src_dir, b)
+    assert a.read_bytes() == b.read_bytes()
+
+
+def test_pack_ozx_pinned_zip_metadata(tmp_path: Path) -> None:
+    """Every entry's ZipInfo has the pinned date_time / create_system / external_attr.
+
+    Guards the upper-half of the reproducibility chain — the round-trip
+    test confirms the bytes match across two runs, this confirms
+    *which* fields were pinned. If a future refactor drops one of the
+    pins, that field starts drifting silently across machines but the
+    byte-equality test still passes for two same-machine same-second
+    runs; this assertion catches that.
+    """
+    src_dir = tmp_path / "src.zarr"
+    with open_ome_zarr(
+        src_dir, layout="fov", mode="w", channel_names=["c"], version="0.5"
+    ) as pos:
+        pos.create_zeros(
+            "0", shape=(1, 1, 1, 2, 2), dtype=np.uint8, chunks=(1, 1, 1, 2, 2)
+        )
+    dst = tmp_path / "out.ozx"
+    pack_ozx(src_dir, dst)
+
+    with zipfile.ZipFile(dst) as zf:
+        infos = zf.infolist()
+    assert infos, "expected at least one zip entry"
+    expected_attr = (0o100000 | 0o644) << 16  # stat.S_IFREG | 0o644, shifted
+    for info in infos:
+        assert info.date_time == (1980, 1, 1, 0, 0, 0), (
+            f"{info.filename}: date_time {info.date_time} not pinned"
+        )
+        assert info.create_system == 3, (
+            f"{info.filename}: create_system {info.create_system} not Unix (3)"
+        )
+        assert info.external_attr == expected_attr, (
+            f"{info.filename}: external_attr {oct(info.external_attr)} "
+            f"not {oct(expected_attr)}"
+        )
+
+
 def test_tensorstore_rejected_for_ozx(tmp_path: Path) -> None:
     """TS cannot write .ozx (its zip kvstore is read-only). Fail loud, not late."""
     pytest.importorskip("tensorstore")