4444log = logging .getLogger (__name__ )
4545
4646
class Filters:
    """Optional criteria for skipping runs while splitting genomes.

    Every attribute defaults to ``None``, meaning "do not filter on this
    dimension".  Each ``filter_out_*`` predicate returns ``True`` when the
    given value should be *skipped* by the caller.
    """

    # Only process this stage (0 = fetch, 1 = index); None means all stages.
    stage: Optional[int] = None
    # Only process this data manager; None means all data managers.
    data_manager: Optional[str] = None
    # Only process this build; None means all builds.
    build_id: Optional[str] = None

    def filter_out_data_manager(self, data_manager: str) -> bool:
        """Return True when *data_manager* should be skipped."""
        return bool(self.data_manager and data_manager != self.data_manager)

    def filter_out_build_id(self, build_id: str) -> bool:
        """Return True when *build_id* should be skipped."""
        return bool(self.build_id and build_id != self.build_id)

    def filter_out_stage(self, stage: int) -> bool:
        """Return True when *stage* should be skipped.

        ``self.stage`` may arrive as a *string* when populated straight from
        argparse (``--filter-stage`` is declared without ``type=int``), in
        which case a plain ``!=`` against the int stage would always be
        unequal and every stage would be filtered out.  Coerce before
        comparing so both int and numeric-string values work.
        """
        return bool(self.stage is not None and int(self.stage) != stage)
60+
61+
class SplitOptions:
    """Configuration bag for splitting a merged genomes file into task files.

    Callers construct an empty instance and assign the attributes afterwards
    (see ``main``); none of the path attributes carry defaults.
    """

    merged_genomes_path: str  # input YAML containing the merged "genomes" list
    split_genomes_path: str  # output directory for per-build/per-indexer task files
    data_managers_path: str  # YAML describing the available data managers
    is_build_complete: IsBuildComplete  # callable (build_id, indexer) -> bool
    filters: Filters

    def __init__(self) -> None:
        # Give each instance its own Filters.  A class-level ``Filters()``
        # default would be a single mutable object shared by every
        # SplitOptions instance, so mutating one would leak into all others.
        self.filters = Filters()
5268
5369
5470def tool_id_for (indexer : str , data_managers : Dict [str , DataManager ]) -> str :
@@ -94,34 +110,31 @@ def write_run_data_manager_to_file(run_data_manager: RunDataManager, path: str):
94110 yaml .safe_dump (run_data_managers .dict (exclude_unset = True ), of )
95111
96112
97- def split_genomes (split_options : SplitOptions ) -> None :
98-
99- def write_task_file (run_data_manager : RunDataManager , build_id : str , indexer : str ):
100- split_genomes_path = split_options .split_genomes_path
101- if not os .path .exists (split_options .split_genomes_path ):
102- safe_makedirs (split_genomes_path )
103-
104- task_file_dir = os .path .join (split_genomes_path , build_id , indexer )
105- task_file = os .path .join (task_file_dir , TASK_FILE_NAME )
106- write_run_data_manager_to_file (run_data_manager , task_file )
107-
113+ def walk_over_incomplete_runs (split_options : SplitOptions ):
108114 data_managers = read_data_managers_configuration (split_options .data_managers_path )
109115 with open (split_options .merged_genomes_path ) as f :
110116 genomes_all = yaml .safe_load (f )
111117 genomes = genomes_all ["genomes" ]
112118 for genome in genomes :
113119 build_id = genome ["id" ]
120+ if split_options .filters .filter_out_build_id (build_id ):
121+ continue
114122
115123 fetch_indexer = "data_manager_fetch_genome_dbkeys_all_fasta"
116- if not split_options .is_build_complete (build_id , fetch_indexer ):
124+ do_fetch = not split_options .filters .filter_out_data_manager (fetch_indexer )
125+ source = genome .get ("source" )
126+ if source is None :
127+ do_fetch = False
128+ if do_fetch and split_options .filters .filter_out_stage (0 ):
129+ do_fetch = False
130+
131+ if do_fetch and not split_options .is_build_complete (build_id , fetch_indexer ):
117132 log .info (f"Fetching: { build_id } " )
118133 fetch_tool_id = tool_id_for (fetch_indexer , data_managers )
119134 fetch_params = []
120135 fetch_params .append ({"dbkey_source|dbkey" : genome ["id" ]})
121136 source = genome .get ("source" )
122- if source is None :
123- continue
124- elif source == "ucsc" :
137+ if source == "ucsc" :
125138 fetch_params .append ({"reference_source|reference_source_selector" : "ucsc" })
126139 fetch_params .append ({"reference_source|requested_dbkey" : genome ["id" ]})
127140 fetch_params .append ({"sequence_name" : genome ["description" ]})
@@ -146,12 +159,18 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st
146159 # Not needed according to Marius
147160 # data_table_reload=["all_fasta", "__dbkeys__"],
148161 )
149- write_task_file ( fetch_run_data_manager , build_id , fetch_indexer )
162+ yield ( build_id , fetch_indexer , fetch_run_data_manager )
150163 else :
151164 log .debug (f"Fetch is already completed: { build_id } " )
152165
153166 indexers = genome .get ("indexers" , [])
154167 for indexer in indexers :
168+ if split_options .filters .filter_out_data_manager (indexer ):
169+ continue
170+
171+ if split_options .filters .filter_out_stage (1 ):
172+ continue
173+
155174 if split_options .is_build_complete (build_id , indexer ):
156175 log .debug (f"Build is already completed: { build_id } { indexer } " )
157176 continue
@@ -179,7 +198,22 @@ def write_task_file(run_data_manager: RunDataManager, build_id: str, indexer: st
179198 params = params ,
180199 items = [item ],
181200 )
182- write_task_file (run_data_manager , build_id , indexer )
201+ yield (build_id , indexer , run_data_manager )
202+
203+
def split_genomes(split_options: SplitOptions) -> None:
    """Persist one task file per incomplete (build, indexer) run.

    Walks :func:`walk_over_incomplete_runs` and writes each yielded
    ``RunDataManager`` to
    ``<split_genomes_path>/<build_id>/<indexer>/<TASK_FILE_NAME>``.
    """

    def _persist(build_id: str, indexer: str, run_data_manager: RunDataManager) -> None:
        # One-line purpose: write a single run's task file under the output root.
        root = split_options.split_genomes_path
        # Create the output root lazily — only once a task actually needs it.
        if not os.path.exists(root):
            safe_makedirs(root)
        target = os.path.join(root, build_id, indexer, TASK_FILE_NAME)
        write_run_data_manager_to_file(run_data_manager, target)

    for build_id, indexer, run_data_manager in walk_over_incomplete_runs(split_options):
        _persist(build_id, indexer, run_data_manager)
183217
184218
185219class GalaxyHistoryIsBuildComplete :
@@ -199,6 +233,12 @@ def _parser():
199233 parser .add_argument ('--merged-genomes-path' , '-m' , default = "genomes.yml" )
200234 parser .add_argument ('--split-genomes-path' , '-s' , default = "data_manager_tasks" )
201235 parser .add_argument ('--data-managers-path' , default = "data_managers.yml" )
236+
237+ # filters
238+ parser .add_argument ('--filter-stage' , default = None )
239+ parser .add_argument ('--filter-data-manager' , default = None )
240+ parser .add_argument ('--filter-build-id' , default = None )
241+
202242 return parser
203243
204244
@@ -225,6 +265,12 @@ def main():
225265 split_options .split_genomes_path = args .split_genomes_path
226266 split_options .is_build_complete = is_build_complete
227267
268+ filters = Filters ()
269+ filters .build_id = args .filter_build_id
270+ filters .data_manager = args .filter_data_manager
271+ filters .stage = args .filter_stage
272+ split_options .filters = filters
273+
228274 split_genomes (split_options )
229275
230276
0 commit comments