Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 40 additions & 2 deletions src/iohub/core/ozx.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from __future__ import annotations

import json
import stat
import logging
import shutil
import zipfile
Expand All @@ -37,6 +38,21 @@
_CENTRAL_DIRECTORY = "centralDirectory"
_JSON_FIRST = "jsonFirst"

# Byte-for-byte reproducibility: every ZipInfo field that ``zipfile``
# would otherwise fill in from the environment (wall-clock mtime,
# OS-specific create_system, OS-specific external_attr) is fixed to a
# constant below. Two `pack_ozx` runs over the same source therefore
# yield identical archives, which keeps downstream sha256 verification
# meaningful across machines and runs.

# Earliest timestamp a zip entry can represent: 1980-01-01 00:00:00.
_REPRODUCIBLE_DATE_TIME = (1980, 1, 1, 0, 0, 0)

# Canonical "created on Unix" marker; applied on every platform so that
# Windows-side packs match Linux-side packs.
_CREATE_SYSTEM_UNIX = 3

# Regular file with mode 0o644, placed in the upper 16 bits, which is
# where external file attributes live for Unix-created entries.
_REPRODUCIBLE_FILE_ATTR = (stat.S_IFREG | 0o644) << 16


def is_ozx_path(path: str | Path) -> bool:
"""Return True when ``path`` ends with ``.ozx``.
Expand Down Expand Up @@ -220,6 +236,22 @@ def _bfs_order(names: Iterable[str]) -> list[str]:
return [name for _, name in meta] + chunks


def _reproducible_zip_info(arcname: str) -> zipfile.ZipInfo:
    """Return a ``ZipInfo`` for ``arcname`` with its variable fields fixed.

    The result is meant to be handed to
    ``zipfile.ZipFile.open(zinfo, mode="w")`` in place of a plain
    arcname string. Opening by bare name leaves ``date_time``,
    ``create_system`` and ``external_attr`` to be filled in from
    runtime state, which is why consecutive ``pack_ozx`` runs over the
    same source would otherwise differ at the byte level.

    Parameters
    ----------
    arcname : str
        Name of the entry inside the archive.

    Returns
    -------
    zipfile.ZipInfo
        Entry header using the module's pinned timestamp, Unix creator
        id, pinned external attributes, and ``ZIP_STORED`` compression.
    """
    pinned = zipfile.ZipInfo(arcname, _REPRODUCIBLE_DATE_TIME)
    # The attribute assignments are independent of one another; each
    # overrides a value that ZipInfo would otherwise default.
    pinned.external_attr = _REPRODUCIBLE_FILE_ATTR
    pinned.create_system = _CREATE_SYSTEM_UNIX
    pinned.compress_type = zipfile.ZIP_STORED
    return pinned


def _write_ozx_archive(
out_path: Path,
ordered: list[str],
Expand All @@ -230,7 +262,9 @@ def _write_ozx_archive(
"""Write an RFC-9 ``.ozx`` to ``out_path`` from a callable entry source.

Owns the destination ``ZipFile``, the archive comment, and the
unlink-on-failure cleanup.
unlink-on-failure cleanup. Each entry's header is built from
:func:`_reproducible_zip_info` so the resulting bytes are stable
across machines and re-runs given identical input.
"""
try:
with zipfile.ZipFile(
Expand All @@ -240,7 +274,11 @@ def _write_ozx_archive(
allowZip64=True,
) as zout:
for arcname in ordered:
with open_member(arcname) as src_f, zout.open(arcname, mode="w") as dst_f:
zinfo = _reproducible_zip_info(arcname)
with (
open_member(arcname) as src_f,
zout.open(zinfo, mode="w", force_zip64=True) as dst_f,
):
shutil.copyfileobj(src_f, dst_f, length=_COPY_BUFFER_BYTES)
zout.comment = _build_comment(version, json_first=True)
except Exception:
Expand Down
62 changes: 62 additions & 0 deletions tests/ngff/test_ozx.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,68 @@ def test_pack_ozx_from_directory(tmp_path: Path) -> None:
pack_ozx(src_dir, dst)


def test_pack_ozx_is_byte_reproducible(tmp_path: Path) -> None:
    """Packing the same source twice yields byte-identical ``.ozx`` files.

    If ``date_time`` / ``create_system`` / ``external_attr`` were not
    pinned on each ``ZipInfo``, Python's default ``zipfile`` behaviour
    would stamp wall-clock mtimes into every Local File Header, so
    consecutive packs would diverge at the byte level and any
    sha256-based artifact integrity check (Croissant
    ``cr:FileObject.sha256``, OZX MANIFEST.json) would fail.
    """
    store = tmp_path / "src.zarr"
    with open_ome_zarr(
        store, layout="fov", mode="w", channel_names=["c"], version="0.5"
    ) as pos:
        arr = pos.create_zeros(
            "0", shape=(1, 1, 1, 4, 4), dtype=np.uint8, chunks=(1, 1, 1, 2, 2)
        )
        # Non-zero payload so the chunk bytes being compared are not trivial.
        arr[:] = np.arange(16, dtype=np.uint8).reshape(arr.shape)

    archives = [tmp_path / name for name in ("a.ozx", "b.ozx")]
    for archive in archives:
        pack_ozx(store, archive)

    first, second = (archive.read_bytes() for archive in archives)
    assert first == second


def test_pack_ozx_pinned_zip_metadata(tmp_path: Path) -> None:
    """Each packed entry carries the pinned date_time / create_system / external_attr.

    Complements the byte-equality test: that one shows two runs agree,
    this one shows *which* header fields are pinned. Should a future
    refactor drop one of the pins, the field would drift silently
    across machines while two same-machine, same-second runs could
    still compare equal; the per-field assertions here catch that.
    """
    store = tmp_path / "src.zarr"
    with open_ome_zarr(
        store, layout="fov", mode="w", channel_names=["c"], version="0.5"
    ) as pos:
        pos.create_zeros(
            "0", shape=(1, 1, 1, 2, 2), dtype=np.uint8, chunks=(1, 1, 1, 2, 2)
        )
    archive = tmp_path / "out.ozx"
    pack_ozx(store, archive)

    expected_date_time = (1980, 1, 1, 0, 0, 0)
    expected_system = 3
    expected_attr = (0o100000 | 0o644) << 16  # stat.S_IFREG | 0o644, shifted

    with zipfile.ZipFile(archive) as zf:
        infos = zf.infolist()
    assert infos, "expected at least one zip entry"

    for info in infos:
        assert info.date_time == expected_date_time, (
            f"{info.filename}: date_time {info.date_time} not pinned"
        )
        assert info.create_system == expected_system, (
            f"{info.filename}: create_system {info.create_system} not Unix (3)"
        )
        assert info.external_attr == expected_attr, (
            f"{info.filename}: external_attr {oct(info.external_attr)} "
            f"not {oct(expected_attr)}"
        )


def test_tensorstore_rejected_for_ozx(tmp_path: Path) -> None:
"""TS cannot write .ozx (its zip kvstore is read-only). Fail loud, not late."""
pytest.importorskip("tensorstore")
Expand Down