Skip to content

Commit 22b7bbb

Browse files
committed
Implement stages and filtering in IDC split script.
1 parent b0d0a0b commit 22b7bbb

2 files changed

Lines changed: 137 additions & 23 deletions

File tree

src/ephemeris/_idc_split_data_manager_genomes.py

Lines changed: 63 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -44,11 +44,27 @@
4444
log = logging.getLogger(__name__)
4545

4646

47+
class Filters:
    """Optional restrictions on which data manager runs get generated.

    Every attribute defaults to ``None``, which disables that filter. The
    ``filter_out_*`` methods return ``True`` when the candidate value should
    be skipped.
    """

    # Stage number to keep; ``None`` keeps every stage.
    stage: Optional[int] = None
    # Data manager (indexer) name to keep; ``None`` or "" keeps all of them.
    data_manager: Optional[str] = None
    # Genome build id to keep; ``None`` or "" keeps all of them.
    build_id: Optional[str] = None

    def filter_out_data_manager(self, data_manager: str) -> bool:
        """Return True if ``data_manager`` differs from the configured one."""
        return bool(self.data_manager and data_manager != self.data_manager)

    def filter_out_build_id(self, build_id: str) -> bool:
        """Return True if ``build_id`` differs from the configured one."""
        return bool(self.build_id and build_id != self.build_id)

    def filter_out_stage(self, stage: int) -> bool:
        """Return True if ``stage`` differs from the configured stage.

        The configured stage is coerced with ``int()`` before comparing: a
        value coming from the command line is a string such as ``"1"``, and
        comparing that string to an integer stage would always be unequal,
        which would filter out every task.

        Raises:
            ValueError: if the configured stage is not integer-like.
        """
        if self.stage is None:
            return False
        return int(self.stage) != stage
60+
61+
4762
class SplitOptions:
    """Settings consumed when splitting the merged genomes file into tasks.

    The path attributes and ``is_build_complete`` have no defaults and must
    be assigned by the caller after construction.
    """

    # Path of the merged genomes YAML file to read.
    merged_genomes_path: str
    # Directory under which per-build task files are written.
    split_genomes_path: str
    # Path of the data managers configuration file.
    data_managers_path: str
    # Callable invoked as is_build_complete(build_id, indexer).
    is_build_complete: IsBuildComplete
    # Filters restricting which runs are produced.
    filters: Filters

    def __init__(self) -> None:
        # Create the filters per instance: a class-level ``Filters()`` default
        # would be one object shared by every SplitOptions, so changing a
        # filter attribute in place would leak into all other instances.
        self.filters = Filters()
5268

5369

5470
def tool_id_for(indexer: str, data_managers: Dict[str, DataManager]) -> str:
@@ -94,34 +110,31 @@ def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str):
94110
yaml.safe_dump(run_data_managers.dict(exclude_unset=True), of)
95111

96112

97-
def split_genomes(split_options: SplitOptions) -> None:
98-
99-
def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: str):
100-
split_genomes_path = split_options.split_genomes_path
101-
if not os.path.exists(split_options.split_genomes_path):
102-
safe_makedirs(split_genomes_path)
103-
104-
task_file_dir = os.path.join(split_genomes_path, build_id, indexer)
105-
task_file = os.path.join(task_file_dir, TASK_FILE_NAME)
106-
write_run_data_manager_to_file(run_data_manager, task_file)
107-
113+
def walk_over_incomplete_runs(split_options: SplitOptions):
108114
data_managers = read_data_managers_configuration(split_options.data_managers_path)
109115
with open(split_options.merged_genomes_path) as f:
110116
genomes_all = yaml.safe_load(f)
111117
genomes = genomes_all["genomes"]
112118
for genome in genomes:
113119
build_id = genome["id"]
120+
if split_options.filters.filter_out_build_id(build_id):
121+
continue
114122

115123
fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta"
116-
if not split_options.is_build_complete(build_id, fetch_indexer):
124+
do_fetch = not split_options.filters.filter_out_data_manager(fetch_indexer)
125+
source = genome.get("source")
126+
if source is None:
127+
do_fetch = False
128+
if do_fetch and split_options.filters.filter_out_stage(0):
129+
do_fetch = False
130+
131+
if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer):
117132
log.info(f"Fetching: {build_id}")
118133
fetch_tool_id = tool_id_for(fetch_indexer, data_managers)
119134
fetch_params = []
120135
fetch_params.append({"dbkey_source|dbkey": genome["id"]})
121136
source = genome.get("source")
122-
if source is None:
123-
continue
124-
elif source == "ucsc":
137+
if source == "ucsc":
125138
fetch_params.append({"reference_source|reference_source_selector": "ucsc"})
126139
fetch_params.append({"reference_source|requested_dbkey": genome["id"]})
127140
fetch_params.append({"sequence_name": genome["description"]})
@@ -146,12 +159,18 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st
146159
# Not needed according to Marius
147160
# data_table_reload=["all_fasta", "__dbkeys__"],
148161
)
149-
write_task_file(fetch_run_data_manager, build_id, fetch_indexer)
162+
yield (build_id, fetch_indexer, fetch_run_data_manager)
150163
else:
151164
log.debug(f"Fetch is already completed: {build_id}")
152165

153166
indexers = genome.get("indexers", [])
154167
for indexer in indexers:
168+
if split_options.filters.filter_out_data_manager(indexer):
169+
continue
170+
171+
if split_options.filters.filter_out_stage(1):
172+
continue
173+
155174
if split_options.is_build_complete(build_id, indexer):
156175
log.debug(f"Build is already completed: {build_id} {indexer}")
157176
continue
@@ -179,7 +198,22 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st
179198
params=params,
180199
items=[item],
181200
)
182-
write_task_file(run_data_manager, build_id, indexer)
201+
yield (build_id, indexer, run_data_manager)
202+
203+
204+
def split_genomes(split_options: SplitOptions) -> None:
    """Write a task file for every run yielded by walk_over_incomplete_runs.

    Each task file is written to
    ``<split_genomes_path>/<build_id>/<indexer>/<TASK_FILE_NAME>``.
    """
    for build_id, indexer, run_data_manager in walk_over_incomplete_runs(split_options):
        output_root = split_options.split_genomes_path
        # Create the output root lazily, only once there is a task to write.
        if not os.path.exists(output_root):
            safe_makedirs(output_root)

        destination = os.path.join(output_root, build_id, indexer, TASK_FILE_NAME)
        write_run_data_manager_to_file(run_data_manager, destination)
183217

184218

185219
class GalaxyHistoryIsBuildComplete:
@@ -199,6 +233,12 @@ def _parser():
199233
parser.add_argument('--merged-genomes-path', '-m', default="genomes.yml")
200234
parser.add_argument('--split-genomes-path', '-s', default="data_manager_tasks")
201235
parser.add_argument('--data-managers-path', default="data_managers.yml")
236+
237+
# filters
238+
parser.add_argument('--filter-stage', default=None)
239+
parser.add_argument('--filter-data-manager', default=None)
240+
parser.add_argument('--filter-build-id', default=None)
241+
202242
return parser
203243

204244

@@ -225,6 +265,12 @@ def main():
225265
split_options.split_genomes_path = args.split_genomes_path
226266
split_options.is_build_complete = is_build_complete
227267

268+
filters = Filters()
269+
filters.build_id = args.filter_build_id
270+
filters.data_manager = args.filter_data_manager
271+
filters.stage = args.filter_stage
272+
split_options.filters = filters
273+
228274
split_genomes(split_options)
229275

230276

tests/test_split_genomes.py

Lines changed: 74 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import yaml
44

55
from ephemeris._idc_split_data_manager_genomes import (
6+
Filters,
67
GalaxyHistoryIsBuildComplete,
78
RunDataManagers,
89
split_genomes,
@@ -62,19 +63,22 @@ def read_and_validate_run_data_manager_yaml(path):
6263
return RunDataManagers(**yaml.safe_load(f))
6364

6465

65-
def test_split_genomes(tmp_path: Path):
66-
setup_mock_idc_dir(tmp_path)
67-
68-
split_path = tmp_path / "split"
69-
66+
def split_options_for(tmp_path: Path) -> SplitOptions:
    """Build SplitOptions that read from and write under ``tmp_path``.

    The completion check is seeded with a single history name, the one for
    hg19_rCRS_pUC18_phiX174 / data_manager_star_index_builder.
    """
    options = SplitOptions()
    options.merged_genomes_path = tmp_path / "genomes.yml"
    options.split_genomes_path = str(tmp_path / "split")
    options.data_managers_path = tmp_path / "data_managers.yml"
    options.is_build_complete = GalaxyHistoryIsBuildComplete(
        ["idc-hg19_rCRS_pUC18_phiX174-data_manager_star_index_builder"]
    )
    return options
76+
77+
78+
def test_split_genomes(tmp_path: Path):
79+
setup_mock_idc_dir(tmp_path)
80+
split_path = tmp_path / "split"
81+
split_options = split_options_for(tmp_path)
7882
split_genomes(split_options)
7983
new_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder"
8084
complete_task = split_path / "hg19_rCRS_pUC18_phiX174" / "data_manager_star_index_builder"
@@ -89,3 +93,67 @@ def test_split_genomes(tmp_path: Path):
8993
assert data_manager.id == "toolshed.g2.bx.psu.edu/repos/devteam/data_manager_twobit_builder/twobit_builder_data_manager/0.0.2"
9094
assert data_manager.items[0]["id"] == "hg19_rCRS_pUC18_phiX174"
9195
assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174"
96+
97+
98+
def test_split_genomes_filter_on_data_manager(tmp_path: Path):
    """The twobit task is only written when the filter selects twobit."""
    setup_mock_idc_dir(tmp_path)
    options = split_options_for(tmp_path)
    twobit_task = tmp_path / "split" / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder"

    data_manager_filter = Filters()
    data_manager_filter.data_manager = "data_manager_star_index_builder"
    options.filters = data_manager_filter

    # Filtering on a different data manager must not produce the twobit task.
    split_genomes(options)
    assert not twobit_task.exists()

    # Re-pointing the same filter object at twobit must produce it.
    data_manager_filter.data_manager = "data_manager_twobit_builder"
    split_genomes(options)
    assert twobit_task.exists()
113+
114+
115+
def test_split_genomes_filter_on_build_id(tmp_path: Path):
    """Filtering on build id "rn6" writes rn6 tasks and skips hg19 tasks."""
    setup_mock_idc_dir(tmp_path)
    options = split_options_for(tmp_path)

    build_filter = Filters()
    build_filter.build_id = "rn6"
    options.filters = build_filter

    split_genomes(options)

    output_root = tmp_path / "split"
    skipped_task = output_root / "hg19_rCRS_pUC18_phiX174" / "data_manager_twobit_builder"
    selected_task = output_root / "rn6" / "data_manager_twobit_builder"
    assert not skipped_task.exists()
    assert selected_task.exists()
129+
130+
131+
def test_split_genomes_filter_on_stage_0(tmp_path: Path):
    """With stage 0 selected, the fetch task is written and twobit is not."""
    setup_mock_idc_dir(tmp_path)
    options = split_options_for(tmp_path)

    stage_filter = Filters()
    stage_filter.stage = 0
    options.filters = stage_filter

    split_genomes(options)

    build_dir = tmp_path / "split" / "hg19_rCRS_pUC18_phiX174"
    skipped_task = build_dir / "data_manager_twobit_builder"
    selected_task = build_dir / "data_manager_fetch_genome_dbkeys_all_fasta"
    assert not skipped_task.exists()
    assert selected_task.exists()
145+
146+
def test_split_genomes_filter_on_stage_1(tmp_path: Path):
    """With stage 1 selected, the twobit task is written and fetch is not."""
    setup_mock_idc_dir(tmp_path)
    options = split_options_for(tmp_path)

    stage_filter = Filters()
    stage_filter.stage = 1
    options.filters = stage_filter

    split_genomes(options)

    build_dir = tmp_path / "split" / "hg19_rCRS_pUC18_phiX174"
    skipped_task = build_dir / "data_manager_fetch_genome_dbkeys_all_fasta"
    selected_task = build_dir / "data_manager_twobit_builder"
    assert not skipped_task.exists()
    assert selected_task.exists()

0 commit comments

Comments
 (0)