Merge pull request #30 from haeussma/from-enzymeml-inhomogeneous-data-shape

JR-1991 · web-flow · commit 545d24ecbd1d · 2026-04-27T16:53:28.000+02:00
Fix `ValidationError` for inhomogeneous EnzymeML measurement data
diff --git a/catalax/__init__.py b/catalax/__init__.py
@@ -22,7 +22,7 @@
     "from_enzymeml",
 ]
 
-__version__ = "0.5.4"
+__version__ = "0.5.5"
 
 PARAMETERS = InAxes.PARAMETERS
 TIME = InAxes.TIME
diff --git a/catalax/dataset/dataset.py b/catalax/dataset/dataset.py
@@ -494,19 +494,36 @@ def from_enzymeml(
     ) -> Dataset:
         """Create a dataset from an EnzymeML document.
 
+        First scans all measurements to determine the global maximum data-array
+        length (``global_max_len``), then constructs each ``Measurement`` with
+        arrays padded to that common length.  Within each measurement, species
+        with shorter time arrays are position-aligned onto the canonical (longest)
+        time axis with ``NaN`` at unsampled positions.  Across measurements,
+        shorter time arrays are extended with monotonic continuation and data
+        padded with ``NaN``.
+
         Args:
-            enzmldoc: EnzymeML document containing experimental data
+            enzmldoc: EnzymeML document containing experimental data.
 
         Returns:
-            A new Dataset object with measurements extracted from the EnzymeML document
+            Dataset with uniformly-shaped measurements ready for JAX operations.
         """
+        global_max_len = max(
+            (
+                len(sp.data)
+                for meas in enzmldoc.measurements
+                for sp in meas.species_data
+                if sp.data is not None
+            ),
+            default=0,
+        )
 
-        missing_initial_conditions = []
-        measurements = []
+        missing_initial_conditions: list[str] = []
+        measurements: list[Measurement] = []
 
         for meas in enzmldoc.measurements:
             if any(sp.initial is not None for sp in meas.species_data):
-                measurements.append(Measurement.from_enzymeml(meas))
+                measurements.append(Measurement.from_enzymeml(meas, global_max_len))
             else:
                 missing_initial_conditions.append(meas.id)
 
@@ -518,17 +535,14 @@ def from_enzymeml(
         small_molecules = [sp.id for sp in enzmldoc.small_molecules]
         proteins = [sp.id for sp in enzmldoc.proteins]
         complexes = [sp.id for sp in enzmldoc.complexes]
-        all_states = small_molecules + proteins + complexes
 
-        dataset = cls(
+        return cls(
             id=enzmldoc.name,
             name=enzmldoc.name,
-            states=all_states,
+            states=small_molecules + proteins + complexes,
             measurements=measurements,
         )
 
-        return dataset
-
     @classmethod
     def from_dataframe(
         cls,
diff --git a/catalax/dataset/measurement.py b/catalax/dataset/measurement.py
@@ -390,41 +390,135 @@ def from_dataframe(
             **kwargs,
         )
 
+    @staticmethod
+    def _pad_species_arrays(
+        measurement: pe.Measurement,
+        global_max_len: int,
+    ) -> tuple[jax.Array, dict[str, jax.Array]]:
+        """Normalize species data onto a single canonical time axis.
+
+        EnzymeML species within the same measurement may be sampled at different
+        time points (different lengths and/or different values).  This method:
+
+        1. Selects the **longest** species time array as the canonical (unified)
+           time axis for this measurement.
+        2. **Validates** that every shorter species time array is a subset of the
+           canonical axis (all their time points appear in the canonical array,
+           within floating-point tolerance ``atol=1e-10``).
+        3. **Position-aligns** each species' data onto the canonical axis, placing
+           ``NaN`` at canonical time positions where that species was not sampled.
+        4. **Extends** the canonical time axis to ``global_max_len`` using
+           monotonic continuation (``+1.0`` per step) and pads all data arrays
+           with ``NaN``.  This cross-measurement normalisation ensures all
+           ``ctx.Measurement`` objects in a ``Dataset`` share the same array
+           length.
+
+        Args:
+            measurement: A pyenzyme ``Measurement`` whose species may have
+                heterogeneous time/data array lengths.
+            global_max_len: Target length for all output arrays.  Determined by
+                ``Dataset.from_enzymeml()`` across all measurements in the
+                document.
+
+        Returns:
+            ``(time, data_dict)`` where ``time`` is a 1-D JAX array of length
+            ``global_max_len`` and every value in ``data_dict`` is a 1-D JAX
+            array of the same length.
+
+        Raises:
+            ValueError: If any species' time array contains values not present
+                in the canonical time array (i.e., it is not a subset).
+        """
+        non_empty = [
+            sp for sp in measurement.species_data
+            if sp.data is not None and len(sp.data) > 0
+        ]
+
+        # 1. Select canonical time array (longest).
+        time_candidates = [
+            sp.time for sp in non_empty if sp.time is not None and len(sp.time) > 0
+        ]
+        if time_candidates:
+            canonical_list: list[float] = max(time_candidates, key=len)
+        else:
+            # No species has time data; synthesise a zero-based integer axis.
+            max_data_len = max((len(sp.data) for sp in non_empty), default=0)
+            canonical_list = list(range(max_data_len))
+
+        canonical_np = np.array(canonical_list, dtype=float)
+
+        # 2 & 3. Validate subset + position-align each species.
+        data_dict: dict[str, jax.Array] = {}
+        for sp in non_empty:
+            sp_data_np = np.array(sp.data, dtype=float)
+
+            if sp.time is None or len(sp.time) == 0:
+                # No time info — align to the start of the canonical axis.
+                aligned = np.full(len(canonical_np), np.nan)
+                aligned[: len(sp_data_np)] = sp_data_np
+            elif len(sp.time) == len(canonical_np) and np.allclose(
+                sp.time, canonical_np, atol=1e-10
+            ):
+                # Same axis as canonical — no alignment needed.
+                aligned = sp_data_np
+            else:
+                sp_time_np = np.array(sp.time, dtype=float)
+                aligned = np.full(len(canonical_np), np.nan)
+                for j, (t, v) in enumerate(zip(sp_time_np, sp_data_np)):
+                    idx = np.where(np.isclose(canonical_np, t, atol=1e-10))[0]
+                    if idx.size == 0:
+                        raise ValueError(
+                            f"Time point {t!r} of species '{sp.species_id}' "
+                            f"(measurement '{measurement.id}') not found in the "
+                            f"canonical time array {canonical_list!r}. All species "
+                            "time arrays must be subsets of the longest time array "
+                            "within the same measurement."
+                        )
+                    aligned[idx[0]] = v
+
+            data_dict[sp.species_id] = jnp.array(aligned)
+
+        # 4. Extend canonical time to global_max_len.
+        pad_len = global_max_len - len(canonical_np)
+        if pad_len > 0:
+            start = canonical_np[-1] + 1.0 if canonical_np.size > 0 else 0.0
+            extension = np.arange(start, start + pad_len, 1.0)
+            canonical_np = np.concatenate([canonical_np, extension])
+            data_dict = {
+                sid: jnp.concatenate([arr, jnp.full(pad_len, jnp.nan)])
+                for sid, arr in data_dict.items()
+            }
+
+        return jnp.array(canonical_np), data_dict
+
     @classmethod
-    def from_enzymeml(cls, measurement: pe.Measurement) -> "Measurement":
-        """Create a Measurement object from a pyenzyme Measurement object.
+    def from_enzymeml(
+        cls,
+        measurement: pe.Measurement,
+        global_max_len: int,
+    ) -> "Measurement":
+        """Create a Measurement from a pyenzyme Measurement object.
+
+        Delegates array normalisation to ``_pad_species_arrays``, which selects
+        the longest species time array as the unified time axis, validates that
+        shorter species time arrays are subsets of it, position-aligns their data
+        (``NaN`` at unsampled positions), and pads all arrays to ``global_max_len``.
 
         Args:
             measurement (pe.Measurement): PyEnzyme measurement object.
+            global_max_len (int): Common length for all output arrays; determined
+                by ``Dataset.from_enzymeml()`` across the entire document.
 
         Returns:
-            Measurement: New Measurement object with data from the PyEnzyme measurement.
+            Measurement: New Measurement with a single unified time axis and
+            NaN-padded data arrays.
         """
         initials = {
-            species.species_id: species.initial
-            for species in measurement.species_data
-            if species.initial is not None
+            sp.species_id: sp.initial
+            for sp in measurement.species_data
+            if sp.initial is not None
         }
-
-        data = {
-            species.species_id: jnp.array(species.data)
-            for species in measurement.species_data
-            if species.data is not None and len(species.data) > 0
-        }
-
-        time = next(
-            iter(
-                [
-                    jnp.array(data.time)
-                    for data in measurement.species_data
-                    if data.time is not None and len(data.time) > 0
-                ]
-            ),
-            None,
-        )
-
-        if measurement.id is None:
-            measurement.id = str(uuid4())
+        time, data = cls._pad_species_arrays(measurement, global_max_len)
 
         return cls(
             initial_conditions=initials,
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "catalax"
-version = "0.5.4"
+version = "0.5.5"
 description = "A JAX-based framework for (neural) ODE modelling in biocatalysis."
 authors = [{ email = "jan.range@simtech.uni-stuttgart.de", name = "Jan Range" }]
 license = "MIT"
diff --git a/tests/unit/dataset/test_dataset_enzymeml.py b/tests/unit/dataset/test_dataset_enzymeml.py
@@ -89,3 +89,44 @@ def test_dataset_from_enzymeml(self):
         assert len(ds.measurements) == len(doc.measurements)
         assert len(ds.states) == len(doc.small_molecules) + len(doc.proteins)
         assert len(ds.measurements) == len(doc.measurements)
+
+    def test_inhomogeneous_species_no_error(self):
+        """Different-length species arrays within one measurement must not raise ValidationError."""
+        doc = pe.EnzymeMLDocument(name="test")
+        s1 = doc.add_to_small_molecules(id="s1", name="s1")
+        s2 = doc.add_to_small_molecules(id="s2", name="s2")
+
+        meas = doc.add_to_measurements(id="m1", name="m1")
+        meas.add_to_species_data(
+            species_id=s1.id, name="s1", initial=10.0,
+            data=[10, 8, 6, 4, 2], time=[0, 1, 2, 3, 4],
+        )
+        meas.add_to_species_data(
+            species_id=s2.id, name="s2", initial=0.0,
+            data=[0, 4, 8], time=[0, 2, 4],
+        )
+
+        ds = ctx.Dataset.from_enzymeml(doc)  # must not raise
+        m = ds.measurements[0]
+        assert len(m.time) == 5
+        assert len(m.data["s1"]) == len(m.data["s2"]) == 5
+
+    def test_multiple_measurements_uniform_length(self):
+        """Measurements of different lengths must be padded to global_max_len."""
+        doc = pe.EnzymeMLDocument(name="test")
+        s1 = doc.add_to_small_molecules(id="s1", name="s1")
+
+        m_short = doc.add_to_measurements(id="m_short", name="m_short")
+        m_short.add_to_species_data(
+            species_id=s1.id, name="s1", initial=10.0,
+            data=[10, 8, 6], time=[0, 1, 2],
+        )
+        m_long = doc.add_to_measurements(id="m_long", name="m_long")
+        m_long.add_to_species_data(
+            species_id=s1.id, name="s1", initial=5.0,
+            data=[5, 4, 3, 2, 1], time=[0, 1, 2, 3, 4],
+        )
+
+        ds = ctx.Dataset.from_enzymeml(doc)
+        lengths = {len(m.time) for m in ds.measurements}
+        assert lengths == {5}, f"Expected uniform length 5, got {lengths}"
diff --git a/tests/unit/dataset/test_measurement_from_enzymeml.py b/tests/unit/dataset/test_measurement_from_enzymeml.py
@@ -0,0 +1,103 @@
+"""Tests for Measurement._pad_species_arrays() and from_enzymeml()."""
+import jax.numpy as jnp
+import pyenzyme as pe
+import pytest
+
+from catalax.dataset.measurement import Measurement
+
+
+def _make_pe_measurement(species: list[dict]) -> pe.Measurement:
+    """Build a pe.Measurement with multiple species_data entries.
+
+    Each entry in ``species`` is a dict with keys:
+        species_id, initial, data, time (all required).
+    """
+    doc = pe.EnzymeMLDocument(name="test")
+    meas = doc.add_to_measurements(id="m1", name="m1")
+    for sp in species:
+        sm = doc.add_to_small_molecules(id=sp["species_id"], name=sp["species_id"])
+        meas.add_to_species_data(
+            species_id=sm.id,
+            name=sp["species_id"],
+            initial=sp["initial"],
+            data=sp["data"],
+            time=sp["time"],
+        )
+    return meas
+
+
+class TestPadSpeciesArrays:
+    def test_homogeneous_no_padding_needed(self):
+        """When all species already have the same length, arrays are unchanged."""
+        meas = _make_pe_measurement([
+            {"species_id": "s1", "initial": 10.0, "data": [10, 8, 6], "time": [0, 1, 2]},
+            {"species_id": "s2", "initial": 0.0,  "data": [0, 2, 4],  "time": [0, 1, 2]},
+        ])
+        time, data = Measurement._pad_species_arrays(meas, global_max_len=3)
+        assert len(time) == 3
+        assert len(data["s1"]) == len(data["s2"]) == 3
+        assert not jnp.any(jnp.isnan(data["s1"]))
+        assert not jnp.any(jnp.isnan(data["s2"]))
+
+    def test_subset_species_aligned_with_nan(self):
+        """Shorter species (subset time) get NaN at positions they were not measured."""
+        # s1: [0,1,2,3,4] — canonical (longest)
+        # s2: [0,2,4]     — subset, measured at every other point
+        meas = _make_pe_measurement([
+            {"species_id": "s1", "initial": 10.0, "data": [10, 8, 6, 4, 2], "time": [0, 1, 2, 3, 4]},
+            {"species_id": "s2", "initial": 0.0,  "data": [0, 4, 8],        "time": [0, 2, 4]},
+        ])
+        time, data = Measurement._pad_species_arrays(meas, global_max_len=5)
+        assert list(time) == [0, 1, 2, 3, 4]
+        assert float(data["s2"][0]) == 0.0
+        assert jnp.isnan(data["s2"][1])      # t=1 not measured
+        assert float(data["s2"][2]) == 4.0
+        assert jnp.isnan(data["s2"][3])      # t=3 not measured
+        assert float(data["s2"][4]) == 8.0
+
+    def test_cross_measurement_padding_to_global_max(self):
+        """When global_max_len > local canonical length, time and data are extended."""
+        meas = _make_pe_measurement([
+            {"species_id": "s1", "initial": 10.0, "data": [10, 8, 6], "time": [0, 1, 2]},
+        ])
+        time, data = Measurement._pad_species_arrays(meas, global_max_len=5)
+        assert len(time) == 5
+        assert float(time[3]) == 3.0      # monotonic continuation
+        assert float(time[4]) == 4.0
+        assert jnp.isnan(data["s1"][3])   # data padded with NaN
+        assert jnp.isnan(data["s1"][4])
+
+    def test_raises_when_species_time_not_subset(self):
+        """Raises ValueError when a species has time points outside the canonical axis."""
+        meas = _make_pe_measurement([
+            {"species_id": "s1", "initial": 10.0, "data": [10, 8, 6, 4],  "time": [0, 1, 2, 3]},
+            {"species_id": "s2", "initial": 0.0,  "data": [0, 2, 4],      "time": [0, 1.5, 3]},
+            # t=1.5 is NOT in s1's time array -> should raise
+        ])
+        with pytest.raises(ValueError, match="not found in the canonical time"):
+            Measurement._pad_species_arrays(meas, global_max_len=4)
+
+
+class TestMeasurementFromEnzymeML:
+    def test_homogeneous_roundtrip(self):
+        """Homogeneous species (same time arrays) round-trip without NaN."""
+        meas = _make_pe_measurement([
+            {"species_id": "s1", "initial": 10.0, "data": [10, 8, 6], "time": [0, 1, 2]},
+            {"species_id": "s2", "initial": 0.0,  "data": [0, 2, 4],  "time": [0, 1, 2]},
+        ])
+        m = Measurement.from_enzymeml(meas, global_max_len=3)
+        assert list(m.time) == [0, 1, 2]
+        assert not jnp.any(jnp.isnan(m.data["s1"]))
+        assert not jnp.any(jnp.isnan(m.data["s2"]))
+
+    def test_inhomogeneous_no_validation_error(self):
+        """The original bug: inhomogeneous lengths must not raise ValidationError."""
+        meas = _make_pe_measurement([
+            {"species_id": "s1", "initial": 10.0, "data": [10, 8, 6, 4, 2], "time": [0, 1, 2, 3, 4]},
+            {"species_id": "s2", "initial": 0.0,  "data": [0, 4, 8],        "time": [0, 2, 4]},
+        ])
+        m = Measurement.from_enzymeml(meas, global_max_len=5)  # must not raise
+        assert len(m.time) == 5
+        assert len(m.data["s1"]) == len(m.data["s2"]) == 5
+        assert jnp.isnan(m.data["s2"][1])
+        assert jnp.isnan(m.data["s2"][3])

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`"from_enzymeml",`
`23`	`23`	`]`
`24`	`24`
`25`		`-__version__ = "0.5.4"`
	`25`	`+__version__ = "0.5.5"`
`26`	`26`
`27`	`27`	`PARAMETERS = InAxes.PARAMETERS`
`28`	`28`	`TIME = InAxes.TIME`