@@ -3881,6 +3881,14 @@ def _add_datasets_to_history(self, history, elements, datasets_visible=False):
38813881 element_object .visible = datasets_visible
38823882 history .stage_addition (element_object )
38833883
@staticmethod
def _read_text_file_lines(path: str, size_hint: int = 10000000) -> list[str]:
    """Read the lines of a UTF-8 text file, up to roughly ``size_hint`` in size.

    The file is decoded explicitly as UTF-8 rather than with the process
    locale's default encoding, so the result does not depend on the
    environment and matches the error message shown to the user.

    :param path: filesystem path of the text file to read.
    :param size_hint: passed to ``readlines``; reading stops once the total
        size of the lines read so far reaches this value, so the remainder
        of a larger file is not returned.
    :returns: the lines read, each still carrying its line terminator.
    :raises exceptions.MessageException: if the content is not valid UTF-8.
    """
    try:
        with open(path, encoding="utf-8") as fh:
            return fh.readlines(size_hint)
    except UnicodeDecodeError as err:
        # Keep the decode error as the cause so logs show the offending byte offset.
        raise exceptions.MessageException("Please provide the file as valid UTF-8.") from err
3891+
38843892 def produce_outputs (self , trans : "ProvidesUserContext" , out_data , output_collections , incoming , history , ** kwds ):
38853893 return self ._outputs_dict ()
38863894
@@ -4505,9 +4513,9 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history
45054513 old_elements_dict = {}
45064514 for element in elements :
45074515 old_elements_dict [element .element_identifier ] = element
4516+ sort_lines = self ._read_text_file_lines (hda .get_file_name ())
45084517 try :
4509- with open (hda .get_file_name ()) as fh :
4510- sorted_elements = [old_elements_dict [line .strip ()] for line in fh ]
4518+ sorted_elements = [old_elements_dict [line .strip ()] for line in sort_lines ]
45114519 except KeyError :
45124520 hdca_history_name = f"{ hdca .hid } : { hdca .name } "
45134521 message = f"List of element identifiers does not match element identifiers in collection '{ hdca_history_name } '"
@@ -4709,8 +4717,7 @@ def add_copied_value_to_new_elements(new_label, dce_object, columns):
47094717 new_rows [new_label ] = columns
47104718
47114719 new_labels_path = new_labels_dataset_assoc .get_file_name ()
4712- with open (new_labels_path ) as fh :
4713- new_labels = fh .readlines (1024 * 1000000 )
4720+ new_labels = self ._read_text_file_lines (new_labels_path )
47144721 if strict and len (hdca .collection .elements ) != len (new_labels ):
47154722 raise exceptions .MessageException ("Relabel mapping file contains incorrect number of identifiers" )
47164723 if how_type in ["tabular" , "tabular_extended" ]:
@@ -4866,8 +4873,7 @@ def add_copied_value_to_new_elements(new_tags_dict, dce):
48664873 new_elements [dce .element_identifier ] = copied_value
48674874
48684875 new_tags_path = new_tags_dataset_assoc .get_file_name ()
4869- with open (new_tags_path ) as fh :
4870- new_tags = fh .readlines (1024 * 1000000 )
4876+ new_tags = self ._read_text_file_lines (new_tags_path )
48714877 # We have a tabular file, where the first column is an existing element identifier,
48724878 # and the remaining columns represent new tags.
48734879 source_new_tags = (line .strip ().split ("\t " ) for line in new_tags )
@@ -4898,8 +4904,7 @@ def produce_outputs(self, trans, out_data, output_collections, incoming, history
48984904 discarded_rows = {}
48994905
49004906 filtered_path = filter_dataset_assoc .get_file_name ()
4901- with open (filtered_path ) as fh :
4902- filtered_identifiers = [i .strip () for i in fh .readlines (1024 * 1000000 )]
4907+ filtered_identifiers = [i .strip () for i in self ._read_text_file_lines (filtered_path )]
49034908
49044909 # If filtered_dataset_assoc is not a two-column tabular dataset we label with the current line of the dataset
49054910 for dce in hdca .collection .elements :
0 commit comments