Raise ToolInputsNotReady for unpopulated structured_like target

mvdbeek · mvdbeek · commit 1394d393ae74 · 2026-04-30T13:03:03.000+02:00
When a tool output is declared ``structured_like=<non-mapped collection
input>`` and the user maps the tool over an empty collection,
param_combinations is empty and ExecutionTracker.example_params falls
back to the raw param_template. The non-mapped collection's batch
wrapper there is never substituted with an HDCA, so
sliced_input_collection_structure crashes with
"Referenced input parameter is not a collection." The real trigger
observed in production was an upstream collection that had not finished
populating yet.

Move the representative-params logic onto MappingParameters so the
fallback branch resolves batch wrappers (and bare {src,id} refs) to
HDCA/DCE ORM objects, and raise ToolInputsNotReadyException when the
referenced collection's populated_optimized is False. The scheduler
already handles this exception by retrying, matching the behavior
__expand_collection_parameter has for mapped-over HDCAs.

Also switch sliced_input_collection_structure to derive the
collection_type via get_collection(input_collection) so the DCE path
stays consistent with the existing get_collection call that follows.

Fixes GALAXY-MAIN-4KSCZZZ0015NC.
diff --git a/lib/galaxy/tools/execute.py b/lib/galaxy/tools/execute.py
@@ -21,7 +21,10 @@
 from packaging.version import Version
 
 from galaxy import model
-from galaxy.exceptions import ToolInputsNotOKException
+from galaxy.exceptions import (
+    ToolInputsNotOKException,
+    ToolInputsNotReadyException,
+)
 from galaxy.model import ToolRequest
 from galaxy.model.dataset_collections.matching import MatchingCollections
 from galaxy.model.dataset_collections.structure import (
@@ -41,6 +44,7 @@
     ToolExecutionCache,
 )
 from galaxy.tools.parameters.workflow_utils import is_runtime_value
+from galaxy.work.context import WorkRequestContext
 from ._types import (
     ToolRequestT,
     ToolStateJobInstancePopulatedT,
@@ -87,6 +91,71 @@ def ensure_validated(self):
         assert self.validated_param_template is not None
         assert self.validated_param_combinations is not None
 
+    def example_params(self, trans: WorkRequestContext) -> ToolStateJobInstancePopulatedT:
+        """Representative per-job params for output-structure determination.
+
+        Normally returns ``param_combinations[0]``. When the request
+        produces zero jobs (e.g. mapping over an empty collection),
+        falls back to a resolved copy of ``param_template``: batch
+        wrappers and raw ``{"src", "id"}`` refs are replaced with
+        HDCA/DCE ORM objects. Raises :class:`ToolInputsNotReadyException`
+        if a referenced collection exists but is not populated yet, so
+        the scheduler retries instead of surfacing the cryptic
+        "Referenced input parameter is not a collection." error.
+        """
+        if self.param_combinations:
+            return self.param_combinations[0]
+        return _resolve_template(self.param_template, trans)
+
+
+def _resolve_template(template: ToolRequestT, trans: WorkRequestContext) -> ToolStateJobInstancePopulatedT:
+    return {key: _resolve_template_value(value, trans) for key, value in template.items()}
+
+
+def _resolve_template_value(value: Any, trans: WorkRequestContext) -> Any:
+    if isinstance(value, dict):
+        values = value.get("values")
+        if (
+            isinstance(values, list)
+            and values
+            and isinstance(values[0], dict)
+            and "src" in values[0]
+            and "id" in values[0]
+        ):
+            return _resolve_collection_ref(values[0], trans, raw_fallback=value)
+        if "src" in value and "id" in value:
+            return _resolve_collection_ref(value, trans, raw_fallback=value)
+        return {k: _resolve_template_value(v, trans) for k, v in value.items()}
+    if isinstance(value, list):
+        return [_resolve_template_value(v, trans) for v in value]
+    return value
+
+
+def _resolve_collection_ref(
+    ref: dict[str, Any],
+    trans: WorkRequestContext,
+    raw_fallback: Any,
+) -> Union[model.HistoryDatasetCollectionAssociation, model.DatasetCollectionElement, Any]:
+    src = ref.get("src")
+    rid = ref.get("id")
+    if rid is None or src not in ("hdca", "dce"):
+        return raw_fallback
+    decoded = rid if isinstance(rid, int) else trans.security.decode_id(rid)
+    sa_session = trans.sa_session
+    if src == "hdca":
+        hdca = sa_session.get(model.HistoryDatasetCollectionAssociation, decoded)
+        if hdca is None:
+            return raw_fallback
+        if not hdca.collection.populated_optimized:
+            raise ToolInputsNotReadyException("An input collection is not populated.")
+        return hdca
+    dce = sa_session.get(model.DatasetCollectionElement, decoded)
+    if dce is None or dce.child_collection is None:
+        return raw_fallback
+    if not dce.child_collection.populated_optimized:
+        raise ToolInputsNotReadyException("An input collection is not populated.")
+    return dce
+
 
 def execute_async(
     trans,
@@ -416,13 +485,7 @@ def param_combinations(self) -> list[ToolStateJobInstancePopulatedT]:
 
     @property
     def example_params(self):
-        if self.mapping_params.param_combinations:
-            return self.mapping_params.param_combinations[0]
-        else:
-            # TODO: This isn't quite right - what we want is something like param_template wrapped,
-            # need a test case with an output filter applied to an empty list, still this is
-            # an improvement over not allowing mapping of empty lists.
-            return self.mapping_params.param_template
+        return self.mapping_params.example_params(self.trans)
 
     @property
     def job_count(self):
@@ -510,7 +573,7 @@ def find_collection(input_dict, input_name, path_prefix=""):
 
         collection_type_description = (
             self.trans.app.dataset_collection_manager.collection_type_descriptions.for_collection_type(
-                input_collection.collection.collection_type
+                get_collection(input_collection).collection_type
             )
         )
         subcollection_mapping_type = None
diff --git a/lib/galaxy_test/api/test_tool_execute.py b/lib/galaxy_test/api/test_tool_execute.py
@@ -256,6 +256,27 @@ def test_map_over_empty_collection(target_history: TargetHistory, required_tool:
     assert "on collection 1" in name
 
 
+@requires_tool_id("collection_mapped_over_empty_structured_like")
+def test_map_over_empty_with_structured_like_non_mapped_collection_input(
+    target_history: TargetHistory, required_tool: RequiredTool
+):
+    # Regression guard: an output declared ``structured_like=<non-mapped
+    # collection input>`` must precreate an implicit output even when the
+    # mapped-over input is empty (zero jobs). Before the fix,
+    # example_params fell back to param_template where the non-mapped
+    # collection's batch wrapper was never substituted, and precreate
+    # crashed with "Referenced input parameter is not a collection."
+    empty_hdca = target_history.with_list([])
+    shape_hdca = target_history.with_pair(["a", "b"])
+    inputs = {
+        "input1": {"batch": True, "values": [empty_hdca.src_dict]},
+        "shape": shape_hdca.src_dict,
+    }
+    execute = required_tool.execute().with_inputs(inputs)
+    execute.assert_has_n_jobs(0)
+    execute.assert_creates_implicit_collection(0)
+
+
 @dataclass
 class MultiRunInRepeatFixtures:
     repeat_datasets: list[SrcDict]
diff --git a/test/functional/tools/collection_mapped_over_empty_structured_like.xml b/test/functional/tools/collection_mapped_over_empty_structured_like.xml
@@ -0,0 +1,13 @@
+<tool id="collection_mapped_over_empty_structured_like" name="collection_mapped_over_empty_structured_like" version="0.1.0">
+  <command><![CDATA[
+        cat '$input1' '${shape.forward}' '${shape.reverse}' > '${list_output.forward}';
+        cat '$input1' '${shape.reverse}' '${shape.forward}' > '${list_output.reverse}'
+  ]]></command>
+  <inputs>
+    <param name="input1" type="data" format="txt" label="Mapped-over data input" />
+    <param name="shape" type="data_collection" collection_type="paired" format="txt" label="Structure source" />
+  </inputs>
+  <outputs>
+    <collection name="list_output" structured_like="shape" type="paired" inherit_format="true" />
+  </outputs>
+</tool>
diff --git a/test/functional/tools/sample_tool_conf.xml b/test/functional/tools/sample_tool_conf.xml
@@ -189,6 +189,7 @@
   <tool file="collection_paired_default.xml" />
   <tool file="collection_paired_structured_like.xml" />
   <tool file="collection_paired_conditional_structured_like.xml" />
+  <tool file="collection_mapped_over_empty_structured_like.xml" />
   <tool file="collection_nested_test.xml" />
   <tool file="collection_nested_default.xml" />
   <tool file="collection_mixed_param.xml" />
diff --git a/test/integration/test_structured_like_unpopulated.py b/test/integration/test_structured_like_unpopulated.py
@@ -0,0 +1,93 @@
+"""Integration test for ToolInputsNotReady on structured_like/unpopulated input.
+
+When a tool output is ``structured_like="<non-mapped collection input>"`` and
+the user maps the tool over an empty collection, implicit output collection
+precreation consults that input to determine output shape. If the referenced
+collection is still populating, we should raise ``ToolInputsNotReadyException``
+(HTTP 400, ``TOOL_INPUTS_NOT_READY``) rather than surfacing the cryptic
+"Referenced input parameter is not a collection." (Sentry issue GALAXY-MAIN-4KSCZZZ0015NC).
+
+This test deterministically produces an unpopulated DatasetCollection by
+downgrading ``populated_state`` directly in the DB after the collection has
+been created via the standard fetch path — the only reliable way to simulate
+the race-window state from pure API tests.
+"""
+
+from sqlalchemy import select
+
+from galaxy.model import (
+    DatasetCollection,
+    HistoryDatasetCollectionAssociation,
+)
+from galaxy_test.base.populators import (
+    DatasetCollectionPopulator,
+    DatasetPopulator,
+)
+from galaxy_test.driver import integration_util
+
+
+class TestStructuredLikeUnpopulatedRaisesNotReady(integration_util.IntegrationTestCase):
+    framework_tool_and_types = True
+
+    dataset_populator: DatasetPopulator
+    dataset_collection_populator: DatasetCollectionPopulator
+    require_admin_user = True
+
+    def setUp(self):
+        super().setUp()
+        self.dataset_populator = DatasetPopulator(self.galaxy_interactor)
+        self.dataset_collection_populator = DatasetCollectionPopulator(self.galaxy_interactor)
+
+    @property
+    def sa_session(self):
+        return self._app.model.session
+
+    def _mark_collection_unpopulated(self, hdca_id: str) -> None:
+        hdca_db_id = self._get(f"configuration/decode/{hdca_id}").json()["decoded_id"]
+        # HDCA.collection_id points at the DatasetCollection row we need.
+        hdca_model = self.sa_session.scalar(
+            select(HistoryDatasetCollectionAssociation).where(HistoryDatasetCollectionAssociation.id == hdca_db_id)
+        )
+        assert hdca_model is not None
+        dc_model = hdca_model.collection
+        dc_model.populated_state = DatasetCollection.populated_states.NEW
+        self.sa_session.add(dc_model)
+        self.sa_session.commit()
+
+    def test_unpopulated_structured_like_target_raises_not_ready(self):
+        with self.dataset_populator.test_history() as history_id:
+            empty_hdca = self.dataset_collection_populator.create_list_in_history(
+                history_id, contents=[], direct_upload=True, wait=True
+            ).json()["output_collections"][0]
+
+            shape_response = self.dataset_collection_populator.create_pair_in_history(
+                history_id, contents=["a", "b"], direct_upload=True, wait=True
+            ).json()
+            shape_hdca = shape_response["output_collections"][0]
+
+            # Simulate upstream "still populating" — mirrors what
+            # happens when the referenced collection is an implicit
+            # collection whose producing jobs haven't finished yet.
+            self._mark_collection_unpopulated(shape_hdca["id"])
+
+            inputs = {
+                "input1": {"batch": True, "values": [{"src": "hdca", "id": empty_hdca["id"]}]},
+                "shape": {"src": "hdca", "id": shape_hdca["id"]},
+            }
+            response = self.dataset_populator.run_tool_raw(
+                tool_id="collection_mapped_over_empty_structured_like",
+                inputs=inputs,
+                history_id=history_id,
+            )
+
+            # Expect HTTP 400 (ToolInputsNotReadyException) with the
+            # same message meta.py raises for mapped-over unpopulated
+            # HDCAs, not the cryptic "Referenced input parameter..."
+            # that used to reach Sentry.
+            assert response.status_code == 400, (
+                f"Expected 400 for unpopulated input collection, got {response.status_code}: {response.text}"
+            )
+            assert "not populated" in response.text, f"Expected 'not populated' in error body, got: {response.text}"
+            assert (
+                "Referenced input parameter is not a collection" not in response.text
+            ), "Regression: old cryptic error surfaced instead of ToolInputsNotReady"