Commit d0edb08

Fix mapped dynamic collection outputs missing with extended metadata
Commit 4172b9b ("Fix AttributeError serializing implicit output collections during job prep") excluded all output_dataset_collections from io_dicts when exclude_implicit_outputs=True. This broke mapped dynamic collection outputs because their per-job DCs were no longer serialized to outputs_new, so set_metadata.py could not discover and populate them. Selectively include output_dataset_collections where the name is not in out_data — this excludes shared DCs for mapped dataset outputs (which have N precreated elements with uninitialized sentinels) while including per-job DCs for mapped collection outputs. Add integration tests verifying no duplicate collection elements in both non-mapped and mapped dynamic collection outputs with extended metadata.
1 parent 1193315 commit d0edb08
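
The selection rule described in the message amounts to a filter over the job's output collection associations. The following is an illustrative sketch only, with hypothetical names (`select_output_collections`, plain dicts for `output_dataset_collections` and `out_data`); it is not the actual Galaxy io_dicts code:

```python
def select_output_collections(output_dataset_collections, out_data, exclude_implicit_outputs):
    """Sketch of the commit's selection rule; names are illustrative, not Galaxy's."""
    if not exclude_implicit_outputs:
        return dict(output_dataset_collections)
    return {
        name: dc
        for name, dc in output_dataset_collections.items()
        # A name also present in out_data is the shared DC behind a mapped *dataset*
        # output (N precreated elements with uninitialized sentinels) -> leave it out.
        # A name not in out_data is the per-job DC for a mapped *collection* output ->
        # keep it so it is serialized to outputs_new and set_metadata.py can populate it.
        if name not in out_data
    }
```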

1 file changed: 210 additions & 0 deletions

@@ -0,0 +1,210 @@
"""Integration test verifying that extended metadata does not create duplicate collection elements."""

from sqlalchemy import (
    func,
    select,
)

from galaxy.model import (
    DatasetCollectionElement,
    HistoryDatasetCollectionAssociation,
)
from galaxy_test.base.populators import (
    DatasetCollectionPopulator,
    DatasetPopulator,
)
from galaxy_test.driver.integration_util import IntegrationTestCase


class TestExtendedMetadataDuplicateElements(IntegrationTestCase):
    dataset_populator: DatasetPopulator
    framework_tool_and_types = True

    @classmethod
    def handle_galaxy_config_kwds(cls, config):
        super().handle_galaxy_config_kwds(config)
        config["metadata_strategy"] = "extended"
        config["retry_metadata_internally"] = False

    def setUp(self):
        super().setUp()
        self.dataset_populator = DatasetPopulator(self.galaxy_interactor)
        self.dataset_collection_populator = DatasetCollectionPopulator(self.galaxy_interactor)

    def test_no_duplicate_elements_in_dynamic_list_output(self, history_id):
        """Run a tool with dynamic collection output and verify no duplicate elements."""
        response = self.dataset_populator.run_tool(
            "collection_creates_dynamic_list_of_pairs",
            {"foo": "bar"},
            history_id,
        )
        job_api_id = response["jobs"][0]["id"]
        self.dataset_populator.wait_for_job(job_api_id, assert_ok=True)

        output_collections = response["output_collections"]
        hdca_details = self.dataset_populator.get_history_collection_details(
            history_id, content_id=output_collections[0]["id"]
        )

        sa_session = self._app.model.session
        hdca_id = self._app.security.decode_id(hdca_details["id"])
        hdca = sa_session.get(HistoryDatasetCollectionAssociation, hdca_id)
        assert hdca is not None
        dc = hdca.collection

        # Verify exactly 3 outer elements (samp1, samp2, samp3) - no duplicates
        outer_count = sa_session.scalar(
            select(func.count()).where(DatasetCollectionElement.dataset_collection_id == dc.id)
        )
        assert outer_count == 3, f"Expected 3 outer elements but found {outer_count}. Duplicate elements detected!"

        # Verify each inner pair has exactly 2 elements (forward, reverse)
        for element in dc.elements:
            assert element.child_collection is not None
            inner_count = sa_session.scalar(
                select(func.count()).where(
                    DatasetCollectionElement.dataset_collection_id == element.child_collection.id
                )
            )
            assert inner_count == 2, (
                f"Expected 2 inner elements for '{element.element_identifier}' "
                f"but found {inner_count}. Duplicate elements detected!"
            )

    def test_no_duplicate_elements_in_mapped_dynamic_collection(self, history_id):
        """Map a tool with dynamic collection output over a list and verify no duplicate elements."""
        # Create a list of 2 tabular datasets
        fetch_response = self.dataset_collection_populator.create_list_in_history(
            history_id,
            contents=["101\t1\n101\t2\n105\t3\n", "201\t10\n201\t20\n205\t30\n"],
            ext="tabular",
            wait=True,
        )
        self.dataset_populator.wait_for_history(history_id, assert_ok=True)
        hdca_id = fetch_response.json()["output_collections"][0]["id"]

        # Map collection_split_on_column over the list
        inputs = {
            "input1": {"batch": True, "values": [{"src": "hdca", "id": hdca_id}]},
        }
        response = self.dataset_populator.run_tool(
            "collection_split_on_column",
            inputs,
            history_id,
        )

        # Wait for all mapping jobs to complete
        for job in response["jobs"]:
            self.dataset_populator.wait_for_job(job["id"], assert_ok=True)

        # The implicit output should be a list:list collection
        implicit_collections = response["implicit_collections"]
        assert len(implicit_collections) == 1
        implicit_hdca_details = self.dataset_populator.get_history_collection_details(
            history_id, content_id=implicit_collections[0]["id"]
        )

        sa_session = self._app.model.session
        implicit_hdca_id = self._app.security.decode_id(implicit_hdca_details["id"])
        implicit_hdca = sa_session.get(HistoryDatasetCollectionAssociation, implicit_hdca_id)
        assert implicit_hdca is not None
        outer_dc = implicit_hdca.collection

        # The outer collection should have exactly 2 elements (one per input)
        outer_count = sa_session.scalar(
            select(func.count()).where(DatasetCollectionElement.dataset_collection_id == outer_dc.id)
        )
        assert (
            outer_count == 2
        ), f"Expected 2 outer elements but found {outer_count}. Duplicate elements detected in mapped output!"

        # Each inner collection should have no duplicate elements
        for element in outer_dc.elements:
            inner_dc = element.child_collection
            assert inner_dc is not None, f"Inner collection for '{element.element_identifier}' is None"
            inner_count = sa_session.scalar(
                select(func.count()).where(DatasetCollectionElement.dataset_collection_id == inner_dc.id)
            )
            # Each input has 2 unique first-column values, so 2 split files
            assert (
                inner_count is not None and inner_count > 0
            ), f"Inner collection for '{element.element_identifier}' has no elements"
            # Check for duplicates: element_identifiers should be unique
            inner_elements = sa_session.scalars(
                select(DatasetCollectionElement).where(DatasetCollectionElement.dataset_collection_id == inner_dc.id)
            ).all()
            identifiers = [e.element_identifier for e in inner_elements]
            assert len(identifiers) == len(set(identifiers)), (
                f"Duplicate element identifiers found in inner collection "
                f"for '{element.element_identifier}': {identifiers}"
            )

    def test_no_duplicate_elements_in_mapped_static_collection(self, history_id):
        """Map a tool with a static collection output over a list and verify no duplicate elements.

        This covers the case where a tool declares a static collection output
        (e.g. paired with forward/reverse) and is mapped over a list input,
        producing a list:paired output. The per-job DC contains fully
        initialized elements and must be included in io_dicts for metadata
        serialization.
        """
        hdca = self.dataset_collection_populator.create_list_in_history(
            history_id,
            contents=["line1\nline2\nline3\nline4\n", "lineA\nlineB\nlineC\nlineD\n"],
            ext="txt",
            wait=True,
        )
        self.dataset_populator.wait_for_history(history_id, assert_ok=True)
        hdca_id = hdca.json()["output_collections"][0]["id"]

        inputs = {
            "input1": {"batch": True, "values": [{"src": "hdca", "id": hdca_id}]},
        }
        response = self.dataset_populator.run_tool(
            "collection_creates_pair",
            inputs,
            history_id,
        )

        for job in response["jobs"]:
            self.dataset_populator.wait_for_job(job["id"], assert_ok=True)

        implicit_collections = response["implicit_collections"]
        assert len(implicit_collections) == 1
        implicit_hdca_details = self.dataset_populator.get_history_collection_details(
            history_id, content_id=implicit_collections[0]["id"]
        )

        sa_session = self._app.model.session
        implicit_hdca_id = self._app.security.decode_id(implicit_hdca_details["id"])
        implicit_hdca = sa_session.get(HistoryDatasetCollectionAssociation, implicit_hdca_id)
        assert implicit_hdca is not None
        outer_dc = implicit_hdca.collection

        # The outer collection should have exactly 2 elements (one per input)
        outer_count = sa_session.scalar(
            select(func.count()).where(DatasetCollectionElement.dataset_collection_id == outer_dc.id)
        )
        assert (
            outer_count == 2
        ), f"Expected 2 outer elements but found {outer_count}. Duplicate elements detected in mapped output!"

        # Each inner pair should have exactly 2 elements (forward, reverse) with no duplicates
        for element in outer_dc.elements:
            inner_dc = element.child_collection
            assert inner_dc is not None, f"Inner collection for '{element.element_identifier}' is None"
            inner_count = sa_session.scalar(
                select(func.count()).where(DatasetCollectionElement.dataset_collection_id == inner_dc.id)
            )
            assert inner_count == 2, (
                f"Expected 2 inner elements for '{element.element_identifier}' "
                f"but found {inner_count}. Duplicate elements detected!"
            )
            inner_elements = sa_session.scalars(
                select(DatasetCollectionElement).where(DatasetCollectionElement.dataset_collection_id == inner_dc.id)
            ).all()
            identifiers = [e.element_identifier for e in inner_elements]
            assert set(identifiers) == {
                "forward",
                "reverse",
            }, f"Expected forward/reverse but got {identifiers} for '{element.element_identifier}'"
